KVM: arm64: Move virt/kvm/arm to arch/arm64
author Marc Zyngier <maz@kernel.org>
Wed, 13 May 2020 10:40:34 +0000 (11:40 +0100)
committer Marc Zyngier <maz@kernel.org>
Sat, 16 May 2020 14:03:59 +0000 (15:03 +0100)
Now that the 32bit KVM/arm host is a distant memory, let's move the
whole of the KVM/arm64 code into the arm64 tree.

As they said in the song: Welcome Home (Sanitarium).

Signed-off-by: Marc Zyngier <maz@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20200513104034.74741-1-maz@kernel.org
65 files changed:
MAINTAINERS
arch/arm64/kvm/Makefile
arch/arm64/kvm/aarch32.c [new file with mode: 0644]
arch/arm64/kvm/arch_timer.c [new file with mode: 0644]
arch/arm64/kvm/arm.c [new file with mode: 0644]
arch/arm64/kvm/handle_exit.c
arch/arm64/kvm/hyp/Makefile
arch/arm64/kvm/hyp/aarch32.c [new file with mode: 0644]
arch/arm64/kvm/hyp/timer-sr.c [new file with mode: 0644]
arch/arm64/kvm/hyp/vgic-v3-sr.c [new file with mode: 0644]
arch/arm64/kvm/hypercalls.c [new file with mode: 0644]
arch/arm64/kvm/mmio.c [new file with mode: 0644]
arch/arm64/kvm/mmu.c [new file with mode: 0644]
arch/arm64/kvm/perf.c [new file with mode: 0644]
arch/arm64/kvm/pmu-emul.c [new file with mode: 0644]
arch/arm64/kvm/psci.c [new file with mode: 0644]
arch/arm64/kvm/pvtime.c [new file with mode: 0644]
arch/arm64/kvm/trace.h
arch/arm64/kvm/trace_arm.h [new file with mode: 0644]
arch/arm64/kvm/trace_handle_exit.h [new file with mode: 0644]
arch/arm64/kvm/vgic-sys-reg-v3.c
arch/arm64/kvm/vgic/trace.h [new file with mode: 0644]
arch/arm64/kvm/vgic/vgic-debug.c [new file with mode: 0644]
arch/arm64/kvm/vgic/vgic-init.c [new file with mode: 0644]
arch/arm64/kvm/vgic/vgic-irqfd.c [new file with mode: 0644]
arch/arm64/kvm/vgic/vgic-its.c [new file with mode: 0644]
arch/arm64/kvm/vgic/vgic-kvm-device.c [new file with mode: 0644]
arch/arm64/kvm/vgic/vgic-mmio-v2.c [new file with mode: 0644]
arch/arm64/kvm/vgic/vgic-mmio-v3.c [new file with mode: 0644]
arch/arm64/kvm/vgic/vgic-mmio.c [new file with mode: 0644]
arch/arm64/kvm/vgic/vgic-mmio.h [new file with mode: 0644]
arch/arm64/kvm/vgic/vgic-v2.c [new file with mode: 0644]
arch/arm64/kvm/vgic/vgic-v3.c [new file with mode: 0644]
arch/arm64/kvm/vgic/vgic-v4.c [new file with mode: 0644]
arch/arm64/kvm/vgic/vgic.c [new file with mode: 0644]
arch/arm64/kvm/vgic/vgic.h [new file with mode: 0644]
virt/kvm/arm/aarch32.c [deleted file]
virt/kvm/arm/arch_timer.c [deleted file]
virt/kvm/arm/arm.c [deleted file]
virt/kvm/arm/hyp/aarch32.c [deleted file]
virt/kvm/arm/hyp/timer-sr.c [deleted file]
virt/kvm/arm/hyp/vgic-v3-sr.c [deleted file]
virt/kvm/arm/hypercalls.c [deleted file]
virt/kvm/arm/mmio.c [deleted file]
virt/kvm/arm/mmu.c [deleted file]
virt/kvm/arm/perf.c [deleted file]
virt/kvm/arm/pmu.c [deleted file]
virt/kvm/arm/psci.c [deleted file]
virt/kvm/arm/pvtime.c [deleted file]
virt/kvm/arm/trace.h [deleted file]
virt/kvm/arm/vgic/trace.h [deleted file]
virt/kvm/arm/vgic/vgic-debug.c [deleted file]
virt/kvm/arm/vgic/vgic-init.c [deleted file]
virt/kvm/arm/vgic/vgic-irqfd.c [deleted file]
virt/kvm/arm/vgic/vgic-its.c [deleted file]
virt/kvm/arm/vgic/vgic-kvm-device.c [deleted file]
virt/kvm/arm/vgic/vgic-mmio-v2.c [deleted file]
virt/kvm/arm/vgic/vgic-mmio-v3.c [deleted file]
virt/kvm/arm/vgic/vgic-mmio.c [deleted file]
virt/kvm/arm/vgic/vgic-mmio.h [deleted file]
virt/kvm/arm/vgic/vgic-v2.c [deleted file]
virt/kvm/arm/vgic/vgic-v3.c [deleted file]
virt/kvm/arm/vgic/vgic-v4.c [deleted file]
virt/kvm/arm/vgic/vgic.c [deleted file]
virt/kvm/arm/vgic/vgic.h [deleted file]

index 091ec22c1a23f174ef376bdd953b30096661ff03..6c5b928989ed70b88186cd9208623f569cd53264 100644 (file)
@@ -9295,7 +9295,6 @@ F:        arch/arm64/include/asm/kvm*
 F:     arch/arm64/include/uapi/asm/kvm*
 F:     arch/arm64/kvm/
 F:     include/kvm/arm_*
-F:     virt/kvm/arm/
 
 KERNEL VIRTUAL MACHINE FOR MIPS (KVM/mips)
 L:     linux-mips@vger.kernel.org
index 5ffbdc39e780e798b38c6da40cd6b864b7cb9b33..7a3768538343133674aa15d25a8c65679f043f39 100644 (file)
@@ -3,37 +3,37 @@
 # Makefile for Kernel-based Virtual Machine module
 #
 
-ccflags-y += -I $(srctree)/$(src) -I $(srctree)/virt/kvm/arm/vgic
+ccflags-y += -I $(srctree)/$(src)
 
 KVM=../../../virt/kvm
 
 obj-$(CONFIG_KVM_ARM_HOST) += kvm.o
 obj-$(CONFIG_KVM_ARM_HOST) += hyp/
 
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o $(KVM)/vfio.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arm.o $(KVM)/arm/mmu.o $(KVM)/arm/mmio.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/psci.o $(KVM)/arm/perf.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hypercalls.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/pvtime.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/eventfd.o $(KVM)/vfio.o $(KVM)/irqchip.o
+kvm-$(CONFIG_KVM_ARM_HOST) += arm.o mmu.o mmio.o
+kvm-$(CONFIG_KVM_ARM_HOST) += psci.o perf.o
+kvm-$(CONFIG_KVM_ARM_HOST) += hypercalls.o
+kvm-$(CONFIG_KVM_ARM_HOST) += pvtime.o
 
 kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o va_layout.o
 kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
 kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o
 kvm-$(CONFIG_KVM_ARM_HOST) += vgic-sys-reg-v3.o fpsimd.o pmu.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/aarch32.o
+kvm-$(CONFIG_KVM_ARM_HOST) += aarch32.o
+kvm-$(CONFIG_KVM_ARM_HOST) += arch_timer.o
+kvm-$(CONFIG_KVM_ARM_PMU)  += pmu-emul.o
 
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-init.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-irqfd.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-v2.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-v3.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-v4.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v2.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v3.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-kvm-device.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-its.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-debug.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/irqchip.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o
-kvm-$(CONFIG_KVM_ARM_PMU) += $(KVM)/arm/pmu.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-init.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-irqfd.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-v2.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-v3.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-v4.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-mmio.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-mmio-v2.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-mmio-v3.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-kvm-device.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-its.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-debug.o
diff --git a/arch/arm64/kvm/aarch32.c b/arch/arm64/kvm/aarch32.c
new file mode 100644 (file)
index 0000000..0a356aa
--- /dev/null
@@ -0,0 +1,204 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * (not much of an) Emulation layer for 32bit guests.
+ *
+ * Copyright (C) 2012,2013 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * based on arch/arm/kvm/emulate.c
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall <c.dall@virtualopensystems.com>
+ */
+
+#include <linux/bits.h>
+#include <linux/kvm_host.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
+
+#define DFSR_FSC_EXTABT_LPAE   0x10
+#define DFSR_FSC_EXTABT_nLPAE  0x08
+#define DFSR_LPAE              BIT(9)
+
+/*
+ * Table taken from ARMv8 ARM DDI0487B-B, table G1-10.
+ */
+static const u8 return_offsets[8][2] = {
+       [0] = { 0, 0 },         /* Reset, unused */
+       [1] = { 4, 2 },         /* Undefined */
+       [2] = { 0, 0 },         /* SVC, unused */
+       [3] = { 4, 4 },         /* Prefetch abort */
+       [4] = { 8, 8 },         /* Data abort */
+       [5] = { 0, 0 },         /* HVC, unused */
+       [6] = { 4, 4 },         /* IRQ, unused */
+       [7] = { 4, 4 },         /* FIQ, unused */
+};
+
+/*
+ * When an exception is taken, most CPSR fields are left unchanged in the
+ * handler. However, some are explicitly overridden (e.g. M[4:0]).
+ *
+ * The SPSR/SPSR_ELx layouts differ, and the below is intended to work with
+ * either format. Note: SPSR.J bit doesn't exist in SPSR_ELx, but this bit was
+ * obsoleted by the ARMv7 virtualization extensions and is RES0.
+ *
+ * For the SPSR layout seen from AArch32, see:
+ * - ARM DDI 0406C.d, page B1-1148
+ * - ARM DDI 0487E.a, page G8-6264
+ *
+ * For the SPSR_ELx layout for AArch32 seen from AArch64, see:
+ * - ARM DDI 0487E.a, page C5-426
+ *
+ * Here we manipulate the fields in order of the AArch32 SPSR_ELx layout, from
+ * MSB to LSB.
+ */
+static unsigned long get_except32_cpsr(struct kvm_vcpu *vcpu, u32 mode)
+{
+       u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
+       unsigned long old, new;
+
+       old = *vcpu_cpsr(vcpu);
+       new = 0;
+
+       new |= (old & PSR_AA32_N_BIT);
+       new |= (old & PSR_AA32_Z_BIT);
+       new |= (old & PSR_AA32_C_BIT);
+       new |= (old & PSR_AA32_V_BIT);
+       new |= (old & PSR_AA32_Q_BIT);
+
+       // CPSR.IT[7:0] are set to zero upon any exception
+       // See ARM DDI 0487E.a, section G1.12.3
+       // See ARM DDI 0406C.d, section B1.8.3
+
+       new |= (old & PSR_AA32_DIT_BIT);
+
+       // CPSR.SSBS is set to SCTLR.DSSBS upon any exception
+       // See ARM DDI 0487E.a, page G8-6244
+       if (sctlr & BIT(31))
+               new |= PSR_AA32_SSBS_BIT;
+
+       // CPSR.PAN is unchanged unless SCTLR.SPAN == 0b0
+       // SCTLR.SPAN is RES1 when ARMv8.1-PAN is not implemented
+       // See ARM DDI 0487E.a, page G8-6246
+       new |= (old & PSR_AA32_PAN_BIT);
+       if (!(sctlr & BIT(23)))
+               new |= PSR_AA32_PAN_BIT;
+
+       // SS does not exist in AArch32, so ignore
+
+       // CPSR.IL is set to zero upon any exception
+       // See ARM DDI 0487E.a, page G1-5527
+
+       new |= (old & PSR_AA32_GE_MASK);
+
+       // CPSR.IT[7:0] are set to zero upon any exception
+       // See prior comment above
+
+       // CPSR.E is set to SCTLR.EE upon any exception
+       // See ARM DDI 0487E.a, page G8-6245
+       // See ARM DDI 0406C.d, page B4-1701
+       if (sctlr & BIT(25))
+               new |= PSR_AA32_E_BIT;
+
+       // CPSR.A is unchanged upon an exception to Undefined, Supervisor
+       // CPSR.A is set upon an exception to other modes
+       // See ARM DDI 0487E.a, pages G1-5515 to G1-5516
+       // See ARM DDI 0406C.d, page B1-1182
+       new |= (old & PSR_AA32_A_BIT);
+       if (mode != PSR_AA32_MODE_UND && mode != PSR_AA32_MODE_SVC)
+               new |= PSR_AA32_A_BIT;
+
+       // CPSR.I is set upon any exception
+       // See ARM DDI 0487E.a, pages G1-5515 to G1-5516
+       // See ARM DDI 0406C.d, page B1-1182
+       new |= PSR_AA32_I_BIT;
+
+       // CPSR.F is set upon an exception to FIQ
+       // CPSR.F is unchanged upon an exception to other modes
+       // See ARM DDI 0487E.a, pages G1-5515 to G1-5516
+       // See ARM DDI 0406C.d, page B1-1182
+       new |= (old & PSR_AA32_F_BIT);
+       if (mode == PSR_AA32_MODE_FIQ)
+               new |= PSR_AA32_F_BIT;
+
+       // CPSR.T is set to SCTLR.TE upon any exception
+       // See ARM DDI 0487E.a, page G8-5514
+       // See ARM DDI 0406C.d, page B1-1181
+       if (sctlr & BIT(30))
+               new |= PSR_AA32_T_BIT;
+
+       new |= mode;
+
+       return new;
+}
+
+static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset)
+{
+       unsigned long spsr = *vcpu_cpsr(vcpu);
+       bool is_thumb = (spsr & PSR_AA32_T_BIT);
+       u32 return_offset = return_offsets[vect_offset >> 2][is_thumb];
+       u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
+
+       *vcpu_cpsr(vcpu) = get_except32_cpsr(vcpu, mode);
+
+       /* Note: These now point to the banked copies */
+       vcpu_write_spsr(vcpu, host_spsr_to_spsr32(spsr));
+       *vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset;
+
+       /* Branch to exception vector */
+       if (sctlr & (1 << 13))
+               vect_offset += 0xffff0000;
+       else /* always have security exceptions */
+               vect_offset += vcpu_cp15(vcpu, c12_VBAR);
+
+       *vcpu_pc(vcpu) = vect_offset;
+}
+
+void kvm_inject_undef32(struct kvm_vcpu *vcpu)
+{
+       prepare_fault32(vcpu, PSR_AA32_MODE_UND, 4);
+}
+
+/*
+ * Modelled after TakeDataAbortException() and TakePrefetchAbortException
+ * pseudocode.
+ */
+static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt,
+                        unsigned long addr)
+{
+       u32 vect_offset;
+       u32 *far, *fsr;
+       bool is_lpae;
+
+       if (is_pabt) {
+               vect_offset = 12;
+               far = &vcpu_cp15(vcpu, c6_IFAR);
+               fsr = &vcpu_cp15(vcpu, c5_IFSR);
+       } else { /* !iabt */
+               vect_offset = 16;
+               far = &vcpu_cp15(vcpu, c6_DFAR);
+               fsr = &vcpu_cp15(vcpu, c5_DFSR);
+       }
+
+       prepare_fault32(vcpu, PSR_AA32_MODE_ABT, vect_offset);
+
+       *far = addr;
+
+       /* Give the guest an IMPLEMENTATION DEFINED exception */
+       is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31);
+       if (is_lpae) {
+               *fsr = DFSR_LPAE | DFSR_FSC_EXTABT_LPAE;
+       } else {
+               /* no need to shuffle FS[4] into DFSR[10] as it's 0 */
+               *fsr = DFSR_FSC_EXTABT_nLPAE;
+       }
+}
+
+void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr)
+{
+       inject_abt32(vcpu, false, addr);
+}
+
+void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr)
+{
+       inject_abt32(vcpu, true, addr);
+}
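
The AArch32 fault-injection code above packs a fair amount of architectural detail into get_except32_cpsr(). The following standalone sketch is illustration only, not kernel code and not part of this commit: it models just the mode, I/A/F mask, endianness and Thumb bits (the NZCVQ, GE, DIT, PAN and SSBS handling is deliberately omitted), using the architectural AArch32 CPSR bit positions, so the composition of the new CPSR can be followed with one concrete value.

/*
 * Minimal standalone model of the CPSR composition in get_except32_cpsr()
 * above (illustration only, not kernel code). Condition flags and the
 * GE/DIT/PAN/SSBS handling are omitted; bit positions are the architectural
 * AArch32 CPSR ones.
 */
#include <stdint.h>
#include <stdio.h>

#define CPSR_T	(1u << 5)	/* Thumb state, copied from SCTLR.TE */
#define CPSR_F	(1u << 6)	/* FIQ mask */
#define CPSR_I	(1u << 7)	/* IRQ mask */
#define CPSR_A	(1u << 8)	/* Asynchronous abort mask */
#define CPSR_E	(1u << 9)	/* Data endianness, copied from SCTLR.EE */

#define MODE_FIQ	0x11u
#define MODE_SVC	0x13u
#define MODE_ABT	0x17u
#define MODE_UND	0x1bu

static uint32_t except32_cpsr(uint32_t old, uint32_t sctlr, uint32_t mode)
{
	uint32_t new = old & (CPSR_A | CPSR_F);	/* A and F inherited by default */

	if (mode != MODE_UND && mode != MODE_SVC)
		new |= CPSR_A;			/* A forced for aborts, IRQ, FIQ */
	new |= CPSR_I;				/* IRQs always masked on entry */
	if (mode == MODE_FIQ)
		new |= CPSR_F;			/* F forced only on FIQ entry */
	if (sctlr & (1u << 25))
		new |= CPSR_E;			/* CPSR.E <- SCTLR.EE */
	if (sctlr & (1u << 30))
		new |= CPSR_T;			/* CPSR.T <- SCTLR.TE */

	return new | mode;
}

int main(void)
{
	/*
	 * Undefined exception taken with all mask bits clear and SCTLR = 0:
	 * expect 0x9b, i.e. I set and M[4:0] = UND.
	 */
	printf("CPSR = %#x\n", (unsigned int)except32_cpsr(0, 0, MODE_UND));
	return 0;
}

Running this prints CPSR = 0x9b. The kernel code above additionally applies the flag, GE, DIT, PAN and SSBS rules, then branches to VBAR (or 0xffff0000 when SCTLR.V selects the high vectors) plus the per-exception vector offset.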
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
new file mode 100644 (file)
index 0000000..93bd59b
--- /dev/null
@@ -0,0 +1,1180 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012 ARM Ltd.
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+
+#include <linux/cpu.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/uaccess.h>
+
+#include <clocksource/arm_arch_timer.h>
+#include <asm/arch_timer.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
+
+#include <kvm/arm_vgic.h>
+#include <kvm/arm_arch_timer.h>
+
+#include "trace.h"
+
+static struct timecounter *timecounter;
+static unsigned int host_vtimer_irq;
+static unsigned int host_ptimer_irq;
+static u32 host_vtimer_irq_flags;
+static u32 host_ptimer_irq_flags;
+
+static DEFINE_STATIC_KEY_FALSE(has_gic_active_state);
+
+static const struct kvm_irq_level default_ptimer_irq = {
+       .irq    = 30,
+       .level  = 1,
+};
+
+static const struct kvm_irq_level default_vtimer_irq = {
+       .irq    = 27,
+       .level  = 1,
+};
+
+static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx);
+static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
+                                struct arch_timer_context *timer_ctx);
+static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx);
+static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
+                               struct arch_timer_context *timer,
+                               enum kvm_arch_timer_regs treg,
+                               u64 val);
+static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
+                             struct arch_timer_context *timer,
+                             enum kvm_arch_timer_regs treg);
+
+u64 kvm_phys_timer_read(void)
+{
+       return timecounter->cc->read(timecounter->cc);
+}
+
+static void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map)
+{
+       if (has_vhe()) {
+               map->direct_vtimer = vcpu_vtimer(vcpu);
+               map->direct_ptimer = vcpu_ptimer(vcpu);
+               map->emul_ptimer = NULL;
+       } else {
+               map->direct_vtimer = vcpu_vtimer(vcpu);
+               map->direct_ptimer = NULL;
+               map->emul_ptimer = vcpu_ptimer(vcpu);
+       }
+
+       trace_kvm_get_timer_map(vcpu->vcpu_id, map);
+}
+
+static inline bool userspace_irqchip(struct kvm *kvm)
+{
+       return static_branch_unlikely(&userspace_irqchip_in_use) &&
+               unlikely(!irqchip_in_kernel(kvm));
+}
+
+static void soft_timer_start(struct hrtimer *hrt, u64 ns)
+{
+       hrtimer_start(hrt, ktime_add_ns(ktime_get(), ns),
+                     HRTIMER_MODE_ABS_HARD);
+}
+
+static void soft_timer_cancel(struct hrtimer *hrt)
+{
+       hrtimer_cancel(hrt);
+}
+
+static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
+{
+       struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
+       struct arch_timer_context *ctx;
+       struct timer_map map;
+
+       /*
+        * We may see a timer interrupt after vcpu_put() has been called which
+        * sets the CPU's vcpu pointer to NULL, because even though the timer
+        * has been disabled in timer_save_state(), the hardware interrupt
+        * signal may not have been retired from the interrupt controller yet.
+        */
+       if (!vcpu)
+               return IRQ_HANDLED;
+
+       get_timer_map(vcpu, &map);
+
+       if (irq == host_vtimer_irq)
+               ctx = map.direct_vtimer;
+       else
+               ctx = map.direct_ptimer;
+
+       if (kvm_timer_should_fire(ctx))
+               kvm_timer_update_irq(vcpu, true, ctx);
+
+       if (userspace_irqchip(vcpu->kvm) &&
+           !static_branch_unlikely(&has_gic_active_state))
+               disable_percpu_irq(host_vtimer_irq);
+
+       return IRQ_HANDLED;
+}
+
+static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx)
+{
+       u64 cval, now;
+
+       cval = timer_ctx->cnt_cval;
+       now = kvm_phys_timer_read() - timer_ctx->cntvoff;
+
+       if (now < cval) {
+               u64 ns;
+
+               ns = cyclecounter_cyc2ns(timecounter->cc,
+                                        cval - now,
+                                        timecounter->mask,
+                                        &timecounter->frac);
+               return ns;
+       }
+
+       return 0;
+}
+
+static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx)
+{
+       WARN_ON(timer_ctx && timer_ctx->loaded);
+       return timer_ctx &&
+              !(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) &&
+               (timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_ENABLE);
+}
+
+/*
+ * Returns the earliest expiration time in ns among guest timers.
+ * Note that it will return 0 if none of the timers can fire.
+ */
+static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu)
+{
+       u64 min_delta = ULLONG_MAX;
+       int i;
+
+       for (i = 0; i < NR_KVM_TIMERS; i++) {
+               struct arch_timer_context *ctx = &vcpu->arch.timer_cpu.timers[i];
+
+               WARN(ctx->loaded, "timer %d loaded\n", i);
+               if (kvm_timer_irq_can_fire(ctx))
+                       min_delta = min(min_delta, kvm_timer_compute_delta(ctx));
+       }
+
+       /* If none of the timers can fire, then return 0 */
+       if (min_delta == ULLONG_MAX)
+               return 0;
+
+       return min_delta;
+}
+
+static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt)
+{
+       struct arch_timer_cpu *timer;
+       struct kvm_vcpu *vcpu;
+       u64 ns;
+
+       timer = container_of(hrt, struct arch_timer_cpu, bg_timer);
+       vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu);
+
+       /*
+        * Check that the timer has really expired from the guest's
+        * PoV (NTP on the host may have forced it to expire
+        * early). If we should have slept longer, restart it.
+        */
+       ns = kvm_timer_earliest_exp(vcpu);
+       if (unlikely(ns)) {
+               hrtimer_forward_now(hrt, ns_to_ktime(ns));
+               return HRTIMER_RESTART;
+       }
+
+       kvm_vcpu_wake_up(vcpu);
+       return HRTIMER_NORESTART;
+}
+
+static enum hrtimer_restart kvm_hrtimer_expire(struct hrtimer *hrt)
+{
+       struct arch_timer_context *ctx;
+       struct kvm_vcpu *vcpu;
+       u64 ns;
+
+       ctx = container_of(hrt, struct arch_timer_context, hrtimer);
+       vcpu = ctx->vcpu;
+
+       trace_kvm_timer_hrtimer_expire(ctx);
+
+       /*
+        * Check that the timer has really expired from the guest's
+        * PoV (NTP on the host may have forced it to expire
+        * early). If not ready, schedule for a later time.
+        */
+       ns = kvm_timer_compute_delta(ctx);
+       if (unlikely(ns)) {
+               hrtimer_forward_now(hrt, ns_to_ktime(ns));
+               return HRTIMER_RESTART;
+       }
+
+       kvm_timer_update_irq(vcpu, true, ctx);
+       return HRTIMER_NORESTART;
+}
+
+static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx)
+{
+       enum kvm_arch_timers index;
+       u64 cval, now;
+
+       if (!timer_ctx)
+               return false;
+
+       index = arch_timer_ctx_index(timer_ctx);
+
+       if (timer_ctx->loaded) {
+               u32 cnt_ctl = 0;
+
+               switch (index) {
+               case TIMER_VTIMER:
+                       cnt_ctl = read_sysreg_el0(SYS_CNTV_CTL);
+                       break;
+               case TIMER_PTIMER:
+                       cnt_ctl = read_sysreg_el0(SYS_CNTP_CTL);
+                       break;
+               case NR_KVM_TIMERS:
+                       /* GCC is braindead */
+                       cnt_ctl = 0;
+                       break;
+               }
+
+               return  (cnt_ctl & ARCH_TIMER_CTRL_ENABLE) &&
+                       (cnt_ctl & ARCH_TIMER_CTRL_IT_STAT) &&
+                      !(cnt_ctl & ARCH_TIMER_CTRL_IT_MASK);
+       }
+
+       if (!kvm_timer_irq_can_fire(timer_ctx))
+               return false;
+
+       cval = timer_ctx->cnt_cval;
+       now = kvm_phys_timer_read() - timer_ctx->cntvoff;
+
+       return cval <= now;
+}
+
+bool kvm_timer_is_pending(struct kvm_vcpu *vcpu)
+{
+       struct timer_map map;
+
+       get_timer_map(vcpu, &map);
+
+       return kvm_timer_should_fire(map.direct_vtimer) ||
+              kvm_timer_should_fire(map.direct_ptimer) ||
+              kvm_timer_should_fire(map.emul_ptimer);
+}
+
+/*
+ * Reflect the timer output level into the kvm_run structure
+ */
+void kvm_timer_update_run(struct kvm_vcpu *vcpu)
+{
+       struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+       struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+       struct kvm_sync_regs *regs = &vcpu->run->s.regs;
+
+       /* Populate the device bitmap with the timer states */
+       regs->device_irq_level &= ~(KVM_ARM_DEV_EL1_VTIMER |
+                                   KVM_ARM_DEV_EL1_PTIMER);
+       if (kvm_timer_should_fire(vtimer))
+               regs->device_irq_level |= KVM_ARM_DEV_EL1_VTIMER;
+       if (kvm_timer_should_fire(ptimer))
+               regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER;
+}
+
+static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
+                                struct arch_timer_context *timer_ctx)
+{
+       int ret;
+
+       timer_ctx->irq.level = new_level;
+       trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq,
+                                  timer_ctx->irq.level);
+
+       if (!userspace_irqchip(vcpu->kvm)) {
+               ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
+                                         timer_ctx->irq.irq,
+                                         timer_ctx->irq.level,
+                                         timer_ctx);
+               WARN_ON(ret);
+       }
+}
+
+/* Only called for a fully emulated timer */
+static void timer_emulate(struct arch_timer_context *ctx)
+{
+       bool should_fire = kvm_timer_should_fire(ctx);
+
+       trace_kvm_timer_emulate(ctx, should_fire);
+
+       if (should_fire != ctx->irq.level) {
+               kvm_timer_update_irq(ctx->vcpu, should_fire, ctx);
+               return;
+       }
+
+       /*
+        * If the timer can fire now, we don't need to have a soft timer
+        * scheduled for the future.  If the timer cannot fire at all,
+        * then we also don't need a soft timer.
+        */
+       if (!kvm_timer_irq_can_fire(ctx)) {
+               soft_timer_cancel(&ctx->hrtimer);
+               return;
+       }
+
+       soft_timer_start(&ctx->hrtimer, kvm_timer_compute_delta(ctx));
+}
+
+static void timer_save_state(struct arch_timer_context *ctx)
+{
+       struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu);
+       enum kvm_arch_timers index = arch_timer_ctx_index(ctx);
+       unsigned long flags;
+
+       if (!timer->enabled)
+               return;
+
+       local_irq_save(flags);
+
+       if (!ctx->loaded)
+               goto out;
+
+       switch (index) {
+       case TIMER_VTIMER:
+               ctx->cnt_ctl = read_sysreg_el0(SYS_CNTV_CTL);
+               ctx->cnt_cval = read_sysreg_el0(SYS_CNTV_CVAL);
+
+               /* Disable the timer */
+               write_sysreg_el0(0, SYS_CNTV_CTL);
+               isb();
+
+               break;
+       case TIMER_PTIMER:
+               ctx->cnt_ctl = read_sysreg_el0(SYS_CNTP_CTL);
+               ctx->cnt_cval = read_sysreg_el0(SYS_CNTP_CVAL);
+
+               /* Disable the timer */
+               write_sysreg_el0(0, SYS_CNTP_CTL);
+               isb();
+
+               break;
+       case NR_KVM_TIMERS:
+               BUG();
+       }
+
+       trace_kvm_timer_save_state(ctx);
+
+       ctx->loaded = false;
+out:
+       local_irq_restore(flags);
+}
+
+/*
+ * Schedule the background timer before calling kvm_vcpu_block, so that this
+ * thread is removed from its waitqueue and made runnable when there's a timer
+ * interrupt to handle.
+ */
+static void kvm_timer_blocking(struct kvm_vcpu *vcpu)
+{
+       struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+       struct timer_map map;
+
+       get_timer_map(vcpu, &map);
+
+       /*
+        * If no timers are capable of raising interrupts (disabled or
+        * masked), then there's no more work for us to do.
+        */
+       if (!kvm_timer_irq_can_fire(map.direct_vtimer) &&
+           !kvm_timer_irq_can_fire(map.direct_ptimer) &&
+           !kvm_timer_irq_can_fire(map.emul_ptimer))
+               return;
+
+       /*
+        * At least one guest timer will expire. Schedule a background timer.
+        * Set the earliest expiration time among the guest timers.
+        */
+       soft_timer_start(&timer->bg_timer, kvm_timer_earliest_exp(vcpu));
+}
+
+static void kvm_timer_unblocking(struct kvm_vcpu *vcpu)
+{
+       struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+
+       soft_timer_cancel(&timer->bg_timer);
+}
+
+static void timer_restore_state(struct arch_timer_context *ctx)
+{
+       struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu);
+       enum kvm_arch_timers index = arch_timer_ctx_index(ctx);
+       unsigned long flags;
+
+       if (!timer->enabled)
+               return;
+
+       local_irq_save(flags);
+
+       if (ctx->loaded)
+               goto out;
+
+       switch (index) {
+       case TIMER_VTIMER:
+               write_sysreg_el0(ctx->cnt_cval, SYS_CNTV_CVAL);
+               isb();
+               write_sysreg_el0(ctx->cnt_ctl, SYS_CNTV_CTL);
+               break;
+       case TIMER_PTIMER:
+               write_sysreg_el0(ctx->cnt_cval, SYS_CNTP_CVAL);
+               isb();
+               write_sysreg_el0(ctx->cnt_ctl, SYS_CNTP_CTL);
+               break;
+       case NR_KVM_TIMERS:
+               BUG();
+       }
+
+       trace_kvm_timer_restore_state(ctx);
+
+       ctx->loaded = true;
+out:
+       local_irq_restore(flags);
+}
+
+static void set_cntvoff(u64 cntvoff)
+{
+       u32 low = lower_32_bits(cntvoff);
+       u32 high = upper_32_bits(cntvoff);
+
+       /*
+        * Since kvm_call_hyp doesn't fully support the ARM PCS especially on
+        * 32-bit systems, but rather passes register by register shifted one
+        * place (we put the function address in r0/x0), we cannot simply pass
+        * a 64-bit value as an argument, but have to split the value into
+        * two 32-bit halves.
+        */
+       kvm_call_hyp(__kvm_timer_set_cntvoff, low, high);
+}
+
+static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, bool active)
+{
+       int r;
+       r = irq_set_irqchip_state(ctx->host_timer_irq, IRQCHIP_STATE_ACTIVE, active);
+       WARN_ON(r);
+}
+
+static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx)
+{
+       struct kvm_vcpu *vcpu = ctx->vcpu;
+       bool phys_active = false;
+
+       /*
+        * Update the timer output so that it is likely to match the
+        * state we're about to restore. If the timer expires between
+        * this point and the register restoration, we'll take the
+        * interrupt anyway.
+        */
+       kvm_timer_update_irq(ctx->vcpu, kvm_timer_should_fire(ctx), ctx);
+
+       if (irqchip_in_kernel(vcpu->kvm))
+               phys_active = kvm_vgic_map_is_active(vcpu, ctx->irq.irq);
+
+       phys_active |= ctx->irq.level;
+
+       set_timer_irq_phys_active(ctx, phys_active);
+}
+
+static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu)
+{
+       struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+
+       /*
+        * Update the timer output so that it is likely to match the
+        * state we're about to restore. If the timer expires between
+        * this point and the register restoration, we'll take the
+        * interrupt anyway.
+        */
+       kvm_timer_update_irq(vcpu, kvm_timer_should_fire(vtimer), vtimer);
+
+       /*
+        * When using a userspace irqchip with the architected timers and a
+        * host interrupt controller that doesn't support an active state, we
+        * must still prevent continuously exiting from the guest, and
+        * therefore mask the physical interrupt by disabling it on the host
+        * interrupt controller when the virtual level is high, such that the
+        * guest can make forward progress.  Once we detect the output level
+        * being de-asserted, we unmask the interrupt again so that we exit
+        * from the guest when the timer fires.
+        */
+       if (vtimer->irq.level)
+               disable_percpu_irq(host_vtimer_irq);
+       else
+               enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
+}
+
+void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
+{
+       struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+       struct timer_map map;
+
+       if (unlikely(!timer->enabled))
+               return;
+
+       get_timer_map(vcpu, &map);
+
+       if (static_branch_likely(&has_gic_active_state)) {
+               kvm_timer_vcpu_load_gic(map.direct_vtimer);
+               if (map.direct_ptimer)
+                       kvm_timer_vcpu_load_gic(map.direct_ptimer);
+       } else {
+               kvm_timer_vcpu_load_nogic(vcpu);
+       }
+
+       set_cntvoff(map.direct_vtimer->cntvoff);
+
+       kvm_timer_unblocking(vcpu);
+
+       timer_restore_state(map.direct_vtimer);
+       if (map.direct_ptimer)
+               timer_restore_state(map.direct_ptimer);
+
+       if (map.emul_ptimer)
+               timer_emulate(map.emul_ptimer);
+}
+
+bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu)
+{
+       struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+       struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+       struct kvm_sync_regs *sregs = &vcpu->run->s.regs;
+       bool vlevel, plevel;
+
+       if (likely(irqchip_in_kernel(vcpu->kvm)))
+               return false;
+
+       vlevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_VTIMER;
+       plevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_PTIMER;
+
+       return kvm_timer_should_fire(vtimer) != vlevel ||
+              kvm_timer_should_fire(ptimer) != plevel;
+}
+
+void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
+{
+       struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+       struct timer_map map;
+
+       if (unlikely(!timer->enabled))
+               return;
+
+       get_timer_map(vcpu, &map);
+
+       timer_save_state(map.direct_vtimer);
+       if (map.direct_ptimer)
+               timer_save_state(map.direct_ptimer);
+
+       /*
+        * Cancel soft timer emulation, because the only case where we
+        * need it after a vcpu_put is in the context of a sleeping VCPU, and
+        * in that case we already factor in the deadline for the physical
+        * timer when scheduling the bg_timer.
+        *
+        * In any case, we re-schedule the hrtimer for the physical timer when
+        * coming back to the VCPU thread in kvm_timer_vcpu_load().
+        */
+       if (map.emul_ptimer)
+               soft_timer_cancel(&map.emul_ptimer->hrtimer);
+
+       if (swait_active(kvm_arch_vcpu_wq(vcpu)))
+               kvm_timer_blocking(vcpu);
+
+       /*
+        * The kernel may decide to run userspace after calling vcpu_put, so
+        * we reset cntvoff to 0 to ensure a consistent read between user
+        * accesses to the virtual counter and kernel accesses to the physical
+        * counter in the non-VHE case. For VHE, the virtual counter uses a
+        * fixed virtual offset of zero, so there is no need to zero the
+        * CNTVOFF_EL2 register.
+        */
+       set_cntvoff(0);
+}
+
+/*
+ * With a userspace irqchip we have to check if the guest de-asserted the
+ * timer and if so, unmask the timer irq signal on the host interrupt
+ * controller to ensure that we see future timer signals.
+ */
+static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu)
+{
+       struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+
+       if (!kvm_timer_should_fire(vtimer)) {
+               kvm_timer_update_irq(vcpu, false, vtimer);
+               if (static_branch_likely(&has_gic_active_state))
+                       set_timer_irq_phys_active(vtimer, false);
+               else
+                       enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
+       }
+}
+
+void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
+{
+       struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+
+       if (unlikely(!timer->enabled))
+               return;
+
+       if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
+               unmask_vtimer_irq_user(vcpu);
+}
+
+int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+       struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+       struct timer_map map;
+
+       get_timer_map(vcpu, &map);
+
+       /*
+        * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8
+        * and to 0 for ARMv7.  We provide an implementation that always
+        * resets the timer to be disabled and unmasked and is compliant with
+        * the ARMv7 architecture.
+        */
+       vcpu_vtimer(vcpu)->cnt_ctl = 0;
+       vcpu_ptimer(vcpu)->cnt_ctl = 0;
+
+       if (timer->enabled) {
+               kvm_timer_update_irq(vcpu, false, vcpu_vtimer(vcpu));
+               kvm_timer_update_irq(vcpu, false, vcpu_ptimer(vcpu));
+
+               if (irqchip_in_kernel(vcpu->kvm)) {
+                       kvm_vgic_reset_mapped_irq(vcpu, map.direct_vtimer->irq.irq);
+                       if (map.direct_ptimer)
+                               kvm_vgic_reset_mapped_irq(vcpu, map.direct_ptimer->irq.irq);
+               }
+       }
+
+       if (map.emul_ptimer)
+               soft_timer_cancel(&map.emul_ptimer->hrtimer);
+
+       return 0;
+}
+
+/* Make the updates of cntvoff for all vtimer contexts atomic */
+static void update_vtimer_cntvoff(struct kvm_vcpu *vcpu, u64 cntvoff)
+{
+       int i;
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_vcpu *tmp;
+
+       mutex_lock(&kvm->lock);
+       kvm_for_each_vcpu(i, tmp, kvm)
+               vcpu_vtimer(tmp)->cntvoff = cntvoff;
+
+       /*
+        * When called from the vcpu create path, the VCPU being created is not
+        * included in the loop above, so we just set it here as well.
+        */
+       vcpu_vtimer(vcpu)->cntvoff = cntvoff;
+       mutex_unlock(&kvm->lock);
+}
+
+void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
+{
+       struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+       struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+       struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+
+       /* Synchronize cntvoff across all vtimers of a VM. */
+       update_vtimer_cntvoff(vcpu, kvm_phys_timer_read());
+       ptimer->cntvoff = 0;
+
+       hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
+       timer->bg_timer.function = kvm_bg_timer_expire;
+
+       hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
+       hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
+       vtimer->hrtimer.function = kvm_hrtimer_expire;
+       ptimer->hrtimer.function = kvm_hrtimer_expire;
+
+       vtimer->irq.irq = default_vtimer_irq.irq;
+       ptimer->irq.irq = default_ptimer_irq.irq;
+
+       vtimer->host_timer_irq = host_vtimer_irq;
+       ptimer->host_timer_irq = host_ptimer_irq;
+
+       vtimer->host_timer_irq_flags = host_vtimer_irq_flags;
+       ptimer->host_timer_irq_flags = host_ptimer_irq_flags;
+
+       vtimer->vcpu = vcpu;
+       ptimer->vcpu = vcpu;
+}
+
+static void kvm_timer_init_interrupt(void *info)
+{
+       enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
+       enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags);
+}
+
+int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
+{
+       struct arch_timer_context *timer;
+
+       switch (regid) {
+       case KVM_REG_ARM_TIMER_CTL:
+               timer = vcpu_vtimer(vcpu);
+               kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
+               break;
+       case KVM_REG_ARM_TIMER_CNT:
+               timer = vcpu_vtimer(vcpu);
+               update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value);
+               break;
+       case KVM_REG_ARM_TIMER_CVAL:
+               timer = vcpu_vtimer(vcpu);
+               kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value);
+               break;
+       case KVM_REG_ARM_PTIMER_CTL:
+               timer = vcpu_ptimer(vcpu);
+               kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
+               break;
+       case KVM_REG_ARM_PTIMER_CVAL:
+               timer = vcpu_ptimer(vcpu);
+               kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value);
+               break;
+
+       default:
+               return -1;
+       }
+
+       return 0;
+}
+
+static u64 read_timer_ctl(struct arch_timer_context *timer)
+{
+       /*
+        * Set ISTATUS bit if it's expired.
+        * Note that according to ARMv8 ARM Issue A.k, ISTATUS bit is
+        * UNKNOWN when ENABLE bit is 0, so we chose to set ISTATUS bit
+        * regardless of ENABLE bit for our implementation convenience.
+        */
+       if (!kvm_timer_compute_delta(timer))
+               return timer->cnt_ctl | ARCH_TIMER_CTRL_IT_STAT;
+       else
+               return timer->cnt_ctl;
+}
+
+u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid)
+{
+       switch (regid) {
+       case KVM_REG_ARM_TIMER_CTL:
+               return kvm_arm_timer_read(vcpu,
+                                         vcpu_vtimer(vcpu), TIMER_REG_CTL);
+       case KVM_REG_ARM_TIMER_CNT:
+               return kvm_arm_timer_read(vcpu,
+                                         vcpu_vtimer(vcpu), TIMER_REG_CNT);
+       case KVM_REG_ARM_TIMER_CVAL:
+               return kvm_arm_timer_read(vcpu,
+                                         vcpu_vtimer(vcpu), TIMER_REG_CVAL);
+       case KVM_REG_ARM_PTIMER_CTL:
+               return kvm_arm_timer_read(vcpu,
+                                         vcpu_ptimer(vcpu), TIMER_REG_CTL);
+       case KVM_REG_ARM_PTIMER_CNT:
+               return kvm_arm_timer_read(vcpu,
+                                         vcpu_ptimer(vcpu), TIMER_REG_CNT);
+       case KVM_REG_ARM_PTIMER_CVAL:
+               return kvm_arm_timer_read(vcpu,
+                                         vcpu_ptimer(vcpu), TIMER_REG_CVAL);
+       }
+       return (u64)-1;
+}
+
+static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
+                             struct arch_timer_context *timer,
+                             enum kvm_arch_timer_regs treg)
+{
+       u64 val;
+
+       switch (treg) {
+       case TIMER_REG_TVAL:
+               val = timer->cnt_cval - kvm_phys_timer_read() + timer->cntvoff;
+               val &= lower_32_bits(val);
+               break;
+
+       case TIMER_REG_CTL:
+               val = read_timer_ctl(timer);
+               break;
+
+       case TIMER_REG_CVAL:
+               val = timer->cnt_cval;
+               break;
+
+       case TIMER_REG_CNT:
+               val = kvm_phys_timer_read() - timer->cntvoff;
+               break;
+
+       default:
+               BUG();
+       }
+
+       return val;
+}
+
+u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu,
+                             enum kvm_arch_timers tmr,
+                             enum kvm_arch_timer_regs treg)
+{
+       u64 val;
+
+       preempt_disable();
+       kvm_timer_vcpu_put(vcpu);
+
+       val = kvm_arm_timer_read(vcpu, vcpu_get_timer(vcpu, tmr), treg);
+
+       kvm_timer_vcpu_load(vcpu);
+       preempt_enable();
+
+       return val;
+}
+
+static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
+                               struct arch_timer_context *timer,
+                               enum kvm_arch_timer_regs treg,
+                               u64 val)
+{
+       switch (treg) {
+       case TIMER_REG_TVAL:
+               timer->cnt_cval = kvm_phys_timer_read() - timer->cntvoff + (s32)val;
+               break;
+
+       case TIMER_REG_CTL:
+               timer->cnt_ctl = val & ~ARCH_TIMER_CTRL_IT_STAT;
+               break;
+
+       case TIMER_REG_CVAL:
+               timer->cnt_cval = val;
+               break;
+
+       default:
+               BUG();
+       }
+}
+
+void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu,
+                               enum kvm_arch_timers tmr,
+                               enum kvm_arch_timer_regs treg,
+                               u64 val)
+{
+       preempt_disable();
+       kvm_timer_vcpu_put(vcpu);
+
+       kvm_arm_timer_write(vcpu, vcpu_get_timer(vcpu, tmr), treg, val);
+
+       kvm_timer_vcpu_load(vcpu);
+       preempt_enable();
+}
+
+static int kvm_timer_starting_cpu(unsigned int cpu)
+{
+       kvm_timer_init_interrupt(NULL);
+       return 0;
+}
+
+static int kvm_timer_dying_cpu(unsigned int cpu)
+{
+       disable_percpu_irq(host_vtimer_irq);
+       return 0;
+}
+
+int kvm_timer_hyp_init(bool has_gic)
+{
+       struct arch_timer_kvm_info *info;
+       int err;
+
+       info = arch_timer_get_kvm_info();
+       timecounter = &info->timecounter;
+
+       if (!timecounter->cc) {
+               kvm_err("kvm_arch_timer: uninitialized timecounter\n");
+               return -ENODEV;
+       }
+
+       /* First, do the virtual EL1 timer irq */
+
+       if (info->virtual_irq <= 0) {
+               kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n",
+                       info->virtual_irq);
+               return -ENODEV;
+       }
+       host_vtimer_irq = info->virtual_irq;
+
+       host_vtimer_irq_flags = irq_get_trigger_type(host_vtimer_irq);
+       if (host_vtimer_irq_flags != IRQF_TRIGGER_HIGH &&
+           host_vtimer_irq_flags != IRQF_TRIGGER_LOW) {
+               kvm_err("Invalid trigger for vtimer IRQ%d, assuming level low\n",
+                       host_vtimer_irq);
+               host_vtimer_irq_flags = IRQF_TRIGGER_LOW;
+       }
+
+       err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler,
+                                "kvm guest vtimer", kvm_get_running_vcpus());
+       if (err) {
+               kvm_err("kvm_arch_timer: can't request vtimer interrupt %d (%d)\n",
+                       host_vtimer_irq, err);
+               return err;
+       }
+
+       if (has_gic) {
+               err = irq_set_vcpu_affinity(host_vtimer_irq,
+                                           kvm_get_running_vcpus());
+               if (err) {
+                       kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
+                       goto out_free_irq;
+               }
+
+               static_branch_enable(&has_gic_active_state);
+       }
+
+       kvm_debug("virtual timer IRQ%d\n", host_vtimer_irq);
+
+       /* Now let's do the physical EL1 timer irq */
+
+       if (info->physical_irq > 0) {
+               host_ptimer_irq = info->physical_irq;
+               host_ptimer_irq_flags = irq_get_trigger_type(host_ptimer_irq);
+               if (host_ptimer_irq_flags != IRQF_TRIGGER_HIGH &&
+                   host_ptimer_irq_flags != IRQF_TRIGGER_LOW) {
+                       kvm_err("Invalid trigger for ptimer IRQ%d, assuming level low\n",
+                               host_ptimer_irq);
+                       host_ptimer_irq_flags = IRQF_TRIGGER_LOW;
+               }
+
+               err = request_percpu_irq(host_ptimer_irq, kvm_arch_timer_handler,
+                                        "kvm guest ptimer", kvm_get_running_vcpus());
+               if (err) {
+                       kvm_err("kvm_arch_timer: can't request ptimer interrupt %d (%d)\n",
+                               host_ptimer_irq, err);
+                       return err;
+               }
+
+               if (has_gic) {
+                       err = irq_set_vcpu_affinity(host_ptimer_irq,
+                                                   kvm_get_running_vcpus());
+                       if (err) {
+                               kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
+                               goto out_free_irq;
+                       }
+               }
+
+               kvm_debug("physical timer IRQ%d\n", host_ptimer_irq);
+       } else if (has_vhe()) {
+               kvm_err("kvm_arch_timer: invalid physical timer IRQ: %d\n",
+                       info->physical_irq);
+               err = -ENODEV;
+               goto out_free_irq;
+       }
+
+       cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING,
+                         "kvm/arm/timer:starting", kvm_timer_starting_cpu,
+                         kvm_timer_dying_cpu);
+       return 0;
+out_free_irq:
+       free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus());
+       return err;
+}
+
+void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
+{
+       struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+
+       soft_timer_cancel(&timer->bg_timer);
+}
+
+static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu)
+{
+       int vtimer_irq, ptimer_irq;
+       int i, ret;
+
+       vtimer_irq = vcpu_vtimer(vcpu)->irq.irq;
+       ret = kvm_vgic_set_owner(vcpu, vtimer_irq, vcpu_vtimer(vcpu));
+       if (ret)
+               return false;
+
+       ptimer_irq = vcpu_ptimer(vcpu)->irq.irq;
+       ret = kvm_vgic_set_owner(vcpu, ptimer_irq, vcpu_ptimer(vcpu));
+       if (ret)
+               return false;
+
+       kvm_for_each_vcpu(i, vcpu, vcpu->kvm) {
+               if (vcpu_vtimer(vcpu)->irq.irq != vtimer_irq ||
+                   vcpu_ptimer(vcpu)->irq.irq != ptimer_irq)
+                       return false;
+       }
+
+       return true;
+}
+
+bool kvm_arch_timer_get_input_level(int vintid)
+{
+       struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
+       struct arch_timer_context *timer;
+
+       if (vintid == vcpu_vtimer(vcpu)->irq.irq)
+               timer = vcpu_vtimer(vcpu);
+       else if (vintid == vcpu_ptimer(vcpu)->irq.irq)
+               timer = vcpu_ptimer(vcpu);
+       else
+               BUG();
+
+       return kvm_timer_should_fire(timer);
+}
+
+int kvm_timer_enable(struct kvm_vcpu *vcpu)
+{
+       struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+       struct timer_map map;
+       int ret;
+
+       if (timer->enabled)
+               return 0;
+
+       /* Without a VGIC we do not map virtual IRQs to physical IRQs */
+       if (!irqchip_in_kernel(vcpu->kvm))
+               goto no_vgic;
+
+       if (!vgic_initialized(vcpu->kvm))
+               return -ENODEV;
+
+       if (!timer_irqs_are_valid(vcpu)) {
+               kvm_debug("incorrectly configured timer irqs\n");
+               return -EINVAL;
+       }
+
+       get_timer_map(vcpu, &map);
+
+       ret = kvm_vgic_map_phys_irq(vcpu,
+                                   map.direct_vtimer->host_timer_irq,
+                                   map.direct_vtimer->irq.irq,
+                                   kvm_arch_timer_get_input_level);
+       if (ret)
+               return ret;
+
+       if (map.direct_ptimer) {
+               ret = kvm_vgic_map_phys_irq(vcpu,
+                                           map.direct_ptimer->host_timer_irq,
+                                           map.direct_ptimer->irq.irq,
+                                           kvm_arch_timer_get_input_level);
+       }
+
+       if (ret)
+               return ret;
+
+no_vgic:
+       timer->enabled = 1;
+       return 0;
+}
+
+/*
+ * On a VHE system, we only need to configure the EL2 timer trap register
+ * once, not on every world switch.
+ * The host kernel runs at EL2 with HCR_EL2.TGE == 1, so these bits have no
+ * effect on host kernel execution.
+ */
+void kvm_timer_init_vhe(void)
+{
+       /* When HCR_EL2.E2H == 1, EL1PCEN and EL1PCTEN are shifted by 10 */
+       u32 cnthctl_shift = 10;
+       u64 val;
+
+       /*
+        * VHE systems allow the guest direct access to the EL1 physical
+        * timer/counter.
+        */
+       val = read_sysreg(cnthctl_el2);
+       val |= (CNTHCTL_EL1PCEN << cnthctl_shift);
+       val |= (CNTHCTL_EL1PCTEN << cnthctl_shift);
+       write_sysreg(val, cnthctl_el2);
+}
+
+static void set_timer_irqs(struct kvm *kvm, int vtimer_irq, int ptimer_irq)
+{
+       struct kvm_vcpu *vcpu;
+       int i;
+
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               vcpu_vtimer(vcpu)->irq.irq = vtimer_irq;
+               vcpu_ptimer(vcpu)->irq.irq = ptimer_irq;
+       }
+}
+
+int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+{
+       int __user *uaddr = (int __user *)(long)attr->addr;
+       struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+       struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+       int irq;
+
+       if (!irqchip_in_kernel(vcpu->kvm))
+               return -EINVAL;
+
+       if (get_user(irq, uaddr))
+               return -EFAULT;
+
+       if (!(irq_is_ppi(irq)))
+               return -EINVAL;
+
+       if (vcpu->arch.timer_cpu.enabled)
+               return -EBUSY;
+
+       switch (attr->attr) {
+       case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
+               set_timer_irqs(vcpu->kvm, irq, ptimer->irq.irq);
+               break;
+       case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
+               set_timer_irqs(vcpu->kvm, vtimer->irq.irq, irq);
+               break;
+       default:
+               return -ENXIO;
+       }
+
+       return 0;
+}
+
+int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+{
+       int __user *uaddr = (int __user *)(long)attr->addr;
+       struct arch_timer_context *timer;
+       int irq;
+
+       switch (attr->attr) {
+       case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
+               timer = vcpu_vtimer(vcpu);
+               break;
+       case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
+               timer = vcpu_ptimer(vcpu);
+               break;
+       default:
+               return -ENXIO;
+       }
+
+       irq = timer->irq.irq;
+       return put_user(irq, uaddr);
+}
+
+int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+{
+       switch (attr->attr) {
+       case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
+       case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
+               return 0;
+       }
+
+       return -ENXIO;
+}
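
One detail of the arch_timer.c code above that is easy to gloss over: kvm_timer_compute_delta() converts a deadline expressed in counter cycles into nanoseconds for the hrtimer via cyclecounter_cyc2ns(), which uses the kernel's usual mult/shift fixed-point scheme. The standalone sketch below is illustration only, not part of this commit: the mult/shift pair is made up for a hypothetical 50 MHz counter, and the fractional-cycle carry the kernel keeps in timecounter->frac is ignored. It shows the arithmetic with one worked value.

/*
 * Standalone illustration (not kernel code) of the cycle-to-nanosecond
 * conversion behind kvm_timer_compute_delta(). A hypothetical 50 MHz
 * counter ticks every 20 ns, encoded here as mult / 2^shift = 20.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
	/* Same fixed-point idea as the kernel's clocksource/cyclecounter math. */
	return (cycles * (uint64_t)mult) >> shift;
}

int main(void)
{
	uint32_t mult = 20u << 22, shift = 22;	/* 20 ns per cycle */
	uint64_t cval = 1500000, now = 1000000;	/* deadline is 500000 cycles away */

	if (now < cval)
		printf("delta = %llu ns\n",
		       (unsigned long long)cyc2ns(cval - now, mult, shift));
	else
		printf("delta = 0 ns (already expired)\n");
	return 0;
}

With these numbers the program prints delta = 10000000 ns, i.e. 10 ms, which is the value the soft timer would be armed with for that deadline.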
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
new file mode 100644 (file)
index 0000000..c958bb3
--- /dev/null
@@ -0,0 +1,1681 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall <c.dall@virtualopensystems.com>
+ */
+
+#include <linux/bug.h>
+#include <linux/cpu_pm.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/kvm_host.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/fs.h>
+#include <linux/mman.h>
+#include <linux/sched.h>
+#include <linux/kvm.h>
+#include <linux/kvm_irqfd.h>
+#include <linux/irqbypass.h>
+#include <linux/sched/stat.h>
+#include <trace/events/kvm.h>
+
+#define CREATE_TRACE_POINTS
+#include "trace_arm.h"
+
+#include <linux/uaccess.h>
+#include <asm/ptrace.h>
+#include <asm/mman.h>
+#include <asm/tlbflush.h>
+#include <asm/cacheflush.h>
+#include <asm/cpufeature.h>
+#include <asm/virt.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_coproc.h>
+#include <asm/sections.h>
+
+#include <kvm/arm_hypercalls.h>
+#include <kvm/arm_pmu.h>
+#include <kvm/arm_psci.h>
+
+#ifdef REQUIRES_VIRT
+__asm__(".arch_extension       virt");
+#endif
+
+DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data);
+static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
+
+/* The VMID used in the VTTBR */
+static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
+static u32 kvm_next_vmid;
+static DEFINE_SPINLOCK(kvm_vmid_lock);
+
+static bool vgic_present;
+
+static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
+DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
+
+int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
+{
+       return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
+}
+
+int kvm_arch_hardware_setup(void *opaque)
+{
+       return 0;
+}
+
+int kvm_arch_check_processor_compat(void *opaque)
+{
+       return 0;
+}
+
+int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+                           struct kvm_enable_cap *cap)
+{
+       int r;
+
+       if (cap->flags)
+               return -EINVAL;
+
+       switch (cap->cap) {
+       case KVM_CAP_ARM_NISV_TO_USER:
+               r = 0;
+               kvm->arch.return_nisv_io_abort_to_user = true;
+               break;
+       default:
+               r = -EINVAL;
+               break;
+       }
+
+       return r;
+}
+
+/**
+ * kvm_arch_init_vm - initializes a VM data structure
+ * @kvm:       pointer to the KVM struct
+ */
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
+{
+       int ret, cpu;
+
+       ret = kvm_arm_setup_stage2(kvm, type);
+       if (ret)
+               return ret;
+
+       kvm->arch.last_vcpu_ran = alloc_percpu(typeof(*kvm->arch.last_vcpu_ran));
+       if (!kvm->arch.last_vcpu_ran)
+               return -ENOMEM;
+
+       for_each_possible_cpu(cpu)
+               *per_cpu_ptr(kvm->arch.last_vcpu_ran, cpu) = -1;
+
+       ret = kvm_alloc_stage2_pgd(kvm);
+       if (ret)
+               goto out_fail_alloc;
+
+       ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP);
+       if (ret)
+               goto out_free_stage2_pgd;
+
+       kvm_vgic_early_init(kvm);
+
+       /* Mark the initial VMID generation invalid */
+       kvm->arch.vmid.vmid_gen = 0;
+
+       /* The maximum number of VCPUs is limited by the host's GIC model */
+       kvm->arch.max_vcpus = vgic_present ?
+                               kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;
+
+       return ret;
+out_free_stage2_pgd:
+       kvm_free_stage2_pgd(kvm);
+out_fail_alloc:
+       free_percpu(kvm->arch.last_vcpu_ran);
+       kvm->arch.last_vcpu_ran = NULL;
+       return ret;
+}
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+       return 0;
+}
+
+vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+       return VM_FAULT_SIGBUS;
+}
+
+
+/**
+ * kvm_arch_destroy_vm - destroy the VM data structure
+ * @kvm:       pointer to the KVM struct
+ */
+void kvm_arch_destroy_vm(struct kvm *kvm)
+{
+       int i;
+
+       kvm_vgic_destroy(kvm);
+
+       free_percpu(kvm->arch.last_vcpu_ran);
+       kvm->arch.last_vcpu_ran = NULL;
+
+       for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+               if (kvm->vcpus[i]) {
+                       kvm_vcpu_destroy(kvm->vcpus[i]);
+                       kvm->vcpus[i] = NULL;
+               }
+       }
+       atomic_set(&kvm->online_vcpus, 0);
+}
+
+int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
+{
+       int r;
+       switch (ext) {
+       case KVM_CAP_IRQCHIP:
+               r = vgic_present;
+               break;
+       case KVM_CAP_IOEVENTFD:
+       case KVM_CAP_DEVICE_CTRL:
+       case KVM_CAP_USER_MEMORY:
+       case KVM_CAP_SYNC_MMU:
+       case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
+       case KVM_CAP_ONE_REG:
+       case KVM_CAP_ARM_PSCI:
+       case KVM_CAP_ARM_PSCI_0_2:
+       case KVM_CAP_READONLY_MEM:
+       case KVM_CAP_MP_STATE:
+       case KVM_CAP_IMMEDIATE_EXIT:
+       case KVM_CAP_VCPU_EVENTS:
+       case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2:
+       case KVM_CAP_ARM_NISV_TO_USER:
+       case KVM_CAP_ARM_INJECT_EXT_DABT:
+               r = 1;
+               break;
+       case KVM_CAP_ARM_SET_DEVICE_ADDR:
+               r = 1;
+               break;
+       case KVM_CAP_NR_VCPUS:
+               r = num_online_cpus();
+               break;
+       case KVM_CAP_MAX_VCPUS:
+               r = KVM_MAX_VCPUS;
+               break;
+       case KVM_CAP_MAX_VCPU_ID:
+               r = KVM_MAX_VCPU_ID;
+               break;
+       case KVM_CAP_MSI_DEVID:
+               if (!kvm)
+                       r = -EINVAL;
+               else
+                       r = kvm->arch.vgic.msis_require_devid;
+               break;
+       case KVM_CAP_ARM_USER_IRQ:
+               /*
+                * 1: EL1_VTIMER, EL1_PTIMER, and PMU.
+                * (bump this number if adding more devices)
+                */
+               r = 1;
+               break;
+       default:
+               r = kvm_arch_vm_ioctl_check_extension(kvm, ext);
+               break;
+       }
+       return r;
+}
+
+long kvm_arch_dev_ioctl(struct file *filp,
+                       unsigned int ioctl, unsigned long arg)
+{
+       return -EINVAL;
+}
+
+struct kvm *kvm_arch_alloc_vm(void)
+{
+       if (!has_vhe())
+               return kzalloc(sizeof(struct kvm), GFP_KERNEL);
+
+       return vzalloc(sizeof(struct kvm));
+}
+
+void kvm_arch_free_vm(struct kvm *kvm)
+{
+       if (!has_vhe())
+               kfree(kvm);
+       else
+               vfree(kvm);
+}
+
+int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
+{
+       if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
+               return -EBUSY;
+
+       if (id >= kvm->arch.max_vcpus)
+               return -EINVAL;
+
+       return 0;
+}
+
+int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
+{
+       int err;
+
+       /* Force users to call KVM_ARM_VCPU_INIT */
+       vcpu->arch.target = -1;
+       bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
+
+       /* Set up the timer */
+       kvm_timer_vcpu_init(vcpu);
+
+       kvm_pmu_vcpu_init(vcpu);
+
+       kvm_arm_reset_debug_ptr(vcpu);
+
+       kvm_arm_pvtime_vcpu_init(&vcpu->arch);
+
+       err = kvm_vgic_vcpu_init(vcpu);
+       if (err)
+               return err;
+
+       return create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP);
+}
+
+void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+}
+
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+       if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm)))
+               static_branch_dec(&userspace_irqchip_in_use);
+
+       kvm_mmu_free_memory_caches(vcpu);
+       kvm_timer_vcpu_terminate(vcpu);
+       kvm_pmu_vcpu_destroy(vcpu);
+
+       kvm_arm_vcpu_destroy(vcpu);
+}
+
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+       return kvm_timer_is_pending(vcpu);
+}
+
+void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
+{
+       /*
+        * If we're about to block (most likely because we've just hit a
+        * WFI), we need to sync back the state of the GIC CPU interface
+        * so that we have the latest PMR and group enables. This ensures
+        * that kvm_arch_vcpu_runnable has up-to-date data to decide
+        * whether we have pending interrupts.
+        *
+        * For the same reason, we want to tell GICv4 that we need
+        * doorbells to be signalled, should an interrupt become pending.
+        */
+       preempt_disable();
+       kvm_vgic_vmcr_sync(vcpu);
+       vgic_v4_put(vcpu, true);
+       preempt_enable();
+}
+
+void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
+{
+       preempt_disable();
+       vgic_v4_load(vcpu);
+       preempt_enable();
+}
+
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+       int *last_ran;
+       kvm_host_data_t *cpu_data;
+
+       last_ran = this_cpu_ptr(vcpu->kvm->arch.last_vcpu_ran);
+       cpu_data = this_cpu_ptr(&kvm_host_data);
+
+       /*
+        * We might get preempted before the vCPU actually runs, but
+        * over-invalidation doesn't affect correctness.
+        */
+       if (*last_ran != vcpu->vcpu_id) {
+               kvm_call_hyp(__kvm_tlb_flush_local_vmid, vcpu);
+               *last_ran = vcpu->vcpu_id;
+       }
+
+       vcpu->cpu = cpu;
+       vcpu->arch.host_cpu_context = &cpu_data->host_ctxt;
+
+       kvm_vgic_load(vcpu);
+       kvm_timer_vcpu_load(vcpu);
+       kvm_vcpu_load_sysregs(vcpu);
+       kvm_arch_vcpu_load_fp(vcpu);
+       kvm_vcpu_pmu_restore_guest(vcpu);
+       if (kvm_arm_is_pvtime_enabled(&vcpu->arch))
+               kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu);
+
+       if (single_task_running())
+               vcpu_clear_wfx_traps(vcpu);
+       else
+               vcpu_set_wfx_traps(vcpu);
+
+       vcpu_ptrauth_setup_lazy(vcpu);
+}
+
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+       kvm_arch_vcpu_put_fp(vcpu);
+       kvm_vcpu_put_sysregs(vcpu);
+       kvm_timer_vcpu_put(vcpu);
+       kvm_vgic_put(vcpu);
+       kvm_vcpu_pmu_restore_host(vcpu);
+
+       vcpu->cpu = -1;
+}
+
+static void vcpu_power_off(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.power_off = true;
+       kvm_make_request(KVM_REQ_SLEEP, vcpu);
+       kvm_vcpu_kick(vcpu);
+}
+
+int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
+                                   struct kvm_mp_state *mp_state)
+{
+       if (vcpu->arch.power_off)
+               mp_state->mp_state = KVM_MP_STATE_STOPPED;
+       else
+               mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
+
+       return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
+                                   struct kvm_mp_state *mp_state)
+{
+       int ret = 0;
+
+       switch (mp_state->mp_state) {
+       case KVM_MP_STATE_RUNNABLE:
+               vcpu->arch.power_off = false;
+               break;
+       case KVM_MP_STATE_STOPPED:
+               vcpu_power_off(vcpu);
+               break;
+       default:
+               ret = -EINVAL;
+       }
+
+       return ret;
+}
+
+/**
+ * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled
+ * @v:         The VCPU pointer
+ *
+ * If the guest CPU is not waiting for interrupts or an interrupt line is
+ * asserted, the CPU is by definition runnable.
+ */
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
+{
+       bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF);
+       return ((irq_lines || kvm_vgic_vcpu_pending_irq(v))
+               && !v->arch.power_off && !v->arch.pause);
+}
+
+bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
+{
+       return vcpu_mode_priv(vcpu);
+}
+
+/* Just ensure a guest exit from a particular CPU */
+static void exit_vm_noop(void *info)
+{
+}
+
+void force_vm_exit(const cpumask_t *mask)
+{
+       preempt_disable();
+       smp_call_function_many(mask, exit_vm_noop, NULL, true);
+       preempt_enable();
+}
+
+/**
+ * need_new_vmid_gen - check that the VMID is still valid
+ * @vmid: The VMID to check
+ *
+ * Return: true if there is a new generation of VMIDs being used
+ *
+ * The hardware supports a limited set of values with the value zero reserved
+ * for the host, so we check if an assigned value belongs to a previous
+ * generation, which requires us to assign a new value. If we're the
+ * first to use a VMID for the new generation, we must flush necessary caches
+ * and TLBs on all CPUs.
+ */
+static bool need_new_vmid_gen(struct kvm_vmid *vmid)
+{
+       u64 current_vmid_gen = atomic64_read(&kvm_vmid_gen);
+       smp_rmb(); /* Orders read of kvm_vmid_gen and kvm->arch.vmid */
+       return unlikely(READ_ONCE(vmid->vmid_gen) != current_vmid_gen);
+}
+
+/**
+ * update_vmid - Update the vmid with a valid VMID for the current generation
+ * @vmid: The stage-2 VMID information struct
+ */
+static void update_vmid(struct kvm_vmid *vmid)
+{
+       if (!need_new_vmid_gen(vmid))
+               return;
+
+       spin_lock(&kvm_vmid_lock);
+
+       /*
+        * Re-check vmid_gen under the lock: if another vcpu already
+        * allocated a valid vmid for this VM while we were waiting,
+        * this vcpu must reuse that vmid.
+        */
+       if (!need_new_vmid_gen(vmid)) {
+               spin_unlock(&kvm_vmid_lock);
+               return;
+       }
+
+       /* First user of a new VMID generation? */
+       if (unlikely(kvm_next_vmid == 0)) {
+               atomic64_inc(&kvm_vmid_gen);
+               kvm_next_vmid = 1;
+
+               /*
+                * On SMP we know no other CPUs can use this CPU's or each
+                * other's VMID after force_vm_exit returns since the
+                * kvm_vmid_lock blocks them from reentry to the guest.
+                */
+               force_vm_exit(cpu_all_mask);
+               /*
+                * Now broadcast TLB + ICACHE invalidation over the inner
+                * shareable domain to make sure all data structures are
+                * clean.
+                */
+               kvm_call_hyp(__kvm_flush_vm_context);
+       }
+
+       vmid->vmid = kvm_next_vmid;
+       kvm_next_vmid++;
+       kvm_next_vmid &= (1 << kvm_get_vmid_bits()) - 1;
+
+       smp_wmb();
+       WRITE_ONCE(vmid->vmid_gen, atomic64_read(&kvm_vmid_gen));
+
+       spin_unlock(&kvm_vmid_lock);
+}
+
+static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
+{
+       struct kvm *kvm = vcpu->kvm;
+       int ret = 0;
+
+       if (likely(vcpu->arch.has_run_once))
+               return 0;
+
+       if (!kvm_arm_vcpu_is_finalized(vcpu))
+               return -EPERM;
+
+       vcpu->arch.has_run_once = true;
+
+       if (likely(irqchip_in_kernel(kvm))) {
+               /*
+                * Map the VGIC hardware resources before running a vcpu the
+                * first time on this VM.
+                */
+               if (unlikely(!vgic_ready(kvm))) {
+                       ret = kvm_vgic_map_resources(kvm);
+                       if (ret)
+                               return ret;
+               }
+       } else {
+               /*
+                * Tell the rest of the code that there are userspace irqchip
+                * VMs in the wild.
+                */
+               static_branch_inc(&userspace_irqchip_in_use);
+       }
+
+       ret = kvm_timer_enable(vcpu);
+       if (ret)
+               return ret;
+
+       ret = kvm_arm_pmu_v3_enable(vcpu);
+
+       return ret;
+}
+
+bool kvm_arch_intc_initialized(struct kvm *kvm)
+{
+       return vgic_initialized(kvm);
+}
+
+void kvm_arm_halt_guest(struct kvm *kvm)
+{
+       int i;
+       struct kvm_vcpu *vcpu;
+
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               vcpu->arch.pause = true;
+       kvm_make_all_cpus_request(kvm, KVM_REQ_SLEEP);
+}
+
+void kvm_arm_resume_guest(struct kvm *kvm)
+{
+       int i;
+       struct kvm_vcpu *vcpu;
+
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               vcpu->arch.pause = false;
+               swake_up_one(kvm_arch_vcpu_wq(vcpu));
+       }
+}
+
+static void vcpu_req_sleep(struct kvm_vcpu *vcpu)
+{
+       struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
+
+       swait_event_interruptible_exclusive(*wq, ((!vcpu->arch.power_off) &&
+                                      (!vcpu->arch.pause)));
+
+       if (vcpu->arch.power_off || vcpu->arch.pause) {
+               /* Awaken to handle a signal, request we sleep again later. */
+               kvm_make_request(KVM_REQ_SLEEP, vcpu);
+       }
+
+       /*
+        * Make sure we will observe a potential reset request if we've
+        * observed a change to the power state. Pairs with the smp_wmb() in
+        * kvm_psci_vcpu_on().
+        */
+       smp_rmb();
+}
+
+static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.target >= 0;
+}
+
+static void check_vcpu_requests(struct kvm_vcpu *vcpu)
+{
+       if (kvm_request_pending(vcpu)) {
+               if (kvm_check_request(KVM_REQ_SLEEP, vcpu))
+                       vcpu_req_sleep(vcpu);
+
+               if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
+                       kvm_reset_vcpu(vcpu);
+
+               /*
+                * Clear IRQ_PENDING requests that were made to guarantee
+                * that a VCPU sees new virtual interrupts.
+                */
+               kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu);
+
+               if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu))
+                       kvm_update_stolen_time(vcpu);
+
+               if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) {
+                       /* The distributor enable bits were changed */
+                       preempt_disable();
+                       vgic_v4_put(vcpu, false);
+                       vgic_v4_load(vcpu);
+                       preempt_enable();
+               }
+       }
+}
+
+/**
+ * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
+ * @vcpu:      The VCPU pointer
+ * @run:       The kvm_run structure pointer used for userspace state exchange
+ *
+ * This function is called through the KVM_RUN ioctl from user space. It
+ * executes guest code in a loop until the time slice for the process is used
+ * up or some emulation is needed from user space, in which case it returns 0
+ * with the kvm_run structure filled in with the data required for the
+ * requested emulation.
+ */
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+       int ret;
+
+       if (unlikely(!kvm_vcpu_initialized(vcpu)))
+               return -ENOEXEC;
+
+       ret = kvm_vcpu_first_run_init(vcpu);
+       if (ret)
+               return ret;
+
+       if (run->exit_reason == KVM_EXIT_MMIO) {
+               ret = kvm_handle_mmio_return(vcpu, vcpu->run);
+               if (ret)
+                       return ret;
+       }
+
+       if (run->immediate_exit)
+               return -EINTR;
+
+       vcpu_load(vcpu);
+
+       kvm_sigset_activate(vcpu);
+
+       ret = 1;
+       run->exit_reason = KVM_EXIT_UNKNOWN;
+       while (ret > 0) {
+               /*
+                * Check conditions before entering the guest
+                */
+               cond_resched();
+
+               update_vmid(&vcpu->kvm->arch.vmid);
+
+               check_vcpu_requests(vcpu);
+
+               /*
+                * Preparing the interrupts to be injected also
+                * involves poking the GIC, which must be done in a
+                * non-preemptible context.
+                */
+               preempt_disable();
+
+               kvm_pmu_flush_hwstate(vcpu);
+
+               local_irq_disable();
+
+               kvm_vgic_flush_hwstate(vcpu);
+
+               /*
+                * Exit if we have a signal pending so that we can deliver the
+                * signal to user space.
+                */
+               if (signal_pending(current)) {
+                       ret = -EINTR;
+                       run->exit_reason = KVM_EXIT_INTR;
+               }
+
+               /*
+                * If we're using a userspace irqchip, then check if we need
+                * to tell a userspace irqchip about timer or PMU level
+                * changes and if so, exit to userspace (the actual level
+                * state gets updated in kvm_timer_update_run and
+                * kvm_pmu_update_run below).
+                */
+               if (static_branch_unlikely(&userspace_irqchip_in_use)) {
+                       if (kvm_timer_should_notify_user(vcpu) ||
+                           kvm_pmu_should_notify_user(vcpu)) {
+                               ret = -EINTR;
+                               run->exit_reason = KVM_EXIT_INTR;
+                       }
+               }
+
+               /*
+                * Ensure we set mode to IN_GUEST_MODE after we disable
+                * interrupts and before the final VCPU requests check.
+                * See the comment in kvm_vcpu_exiting_guest_mode() and
+                * Documentation/virt/kvm/vcpu-requests.rst
+                */
+               smp_store_mb(vcpu->mode, IN_GUEST_MODE);
+
+               if (ret <= 0 || need_new_vmid_gen(&vcpu->kvm->arch.vmid) ||
+                   kvm_request_pending(vcpu)) {
+                       vcpu->mode = OUTSIDE_GUEST_MODE;
+                       isb(); /* Ensure work in x_flush_hwstate is committed */
+                       kvm_pmu_sync_hwstate(vcpu);
+                       if (static_branch_unlikely(&userspace_irqchip_in_use))
+                               kvm_timer_sync_hwstate(vcpu);
+                       kvm_vgic_sync_hwstate(vcpu);
+                       local_irq_enable();
+                       preempt_enable();
+                       continue;
+               }
+
+               kvm_arm_setup_debug(vcpu);
+
+               /**************************************************************
+                * Enter the guest
+                */
+               trace_kvm_entry(*vcpu_pc(vcpu));
+               guest_enter_irqoff();
+
+               if (has_vhe()) {
+                       ret = kvm_vcpu_run_vhe(vcpu);
+               } else {
+                       ret = kvm_call_hyp_ret(__kvm_vcpu_run_nvhe, vcpu);
+               }
+
+               vcpu->mode = OUTSIDE_GUEST_MODE;
+               vcpu->stat.exits++;
+               /*
+                * Back from guest
+                *************************************************************/
+
+               kvm_arm_clear_debug(vcpu);
+
+               /*
+                * We must sync the PMU state before the vgic state so
+                * that the vgic can properly sample the updated state of the
+                * interrupt line.
+                */
+               kvm_pmu_sync_hwstate(vcpu);
+
+               /*
+                * Sync the vgic state before syncing the timer state because
+                * the timer code needs to know if the virtual timer
+                * interrupts are active.
+                */
+               kvm_vgic_sync_hwstate(vcpu);
+
+               /*
+                * Sync the timer hardware state before enabling interrupts as
+                * we don't want vtimer interrupts to race with syncing the
+                * timer virtual interrupt state.
+                */
+               if (static_branch_unlikely(&userspace_irqchip_in_use))
+                       kvm_timer_sync_hwstate(vcpu);
+
+               kvm_arch_vcpu_ctxsync_fp(vcpu);
+
+               /*
+                * We may have taken a host interrupt while in HYP mode (i.e.
+                * while executing the guest). This interrupt is still
+                * pending, as we haven't serviced it yet!
+                *
+                * We're now back in the host kernel, with interrupts
+                * disabled.  Enabling the interrupts now will have
+                * the effect of taking the interrupt again, in the host
+                * kernel this time.
+                */
+               local_irq_enable();
+
+               /*
+                * We do local_irq_enable() before calling guest_exit() so
+                * that if a timer interrupt hits while running the guest we
+                * account that tick as being spent in the guest.  We enable
+                * preemption after calling guest_exit() so that if we get
+                * preempted we make sure ticks after that are not counted as
+                * guest time.
+                */
+               guest_exit();
+               trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
+
+               /* Exit types that need handling before we can be preempted */
+               handle_exit_early(vcpu, run, ret);
+
+               preempt_enable();
+
+               ret = handle_exit(vcpu, run, ret);
+       }
+
+       /* Tell userspace about in-kernel device output levels */
+       if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
+               kvm_timer_update_run(vcpu);
+               kvm_pmu_update_run(vcpu);
+       }
+
+       kvm_sigset_deactivate(vcpu);
+
+       vcpu_put(vcpu);
+       return ret;
+}
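
For context, the userspace half of this contract looks roughly like the sketch below: mmap the shared kvm_run structure, call KVM_RUN, and dispatch on exit_reason. This is illustrative only (kvm_fd/vcpu_fd are placeholders), not code from this patch.

/* Illustrative userspace run loop, not part of this patch. */
#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

static int run_vcpu(int kvm_fd, int vcpu_fd)
{
	int run_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
	struct kvm_run *run;

	if (run_size < 0)
		return -1;

	run = mmap(NULL, run_size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpu_fd, 0);
	if (run == MAP_FAILED)
		return -1;

	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
			return -1;		/* e.g. interrupted by a signal */

		switch (run->exit_reason) {
		case KVM_EXIT_MMIO:
			/* Emulate the access here, then loop; the kernel
			 * completes it via kvm_handle_mmio_return() above. */
			break;
		case KVM_EXIT_INTR:
			break;			/* signal pending; just retry */
		default:
			printf("unhandled exit %u\n", run->exit_reason);
			return 0;
		}
	}
}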
+
+static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level)
+{
+       int bit_index;
+       bool set;
+       unsigned long *hcr;
+
+       if (number == KVM_ARM_IRQ_CPU_IRQ)
+               bit_index = __ffs(HCR_VI);
+       else /* KVM_ARM_IRQ_CPU_FIQ */
+               bit_index = __ffs(HCR_VF);
+
+       hcr = vcpu_hcr(vcpu);
+       if (level)
+               set = test_and_set_bit(bit_index, hcr);
+       else
+               set = test_and_clear_bit(bit_index, hcr);
+
+       /*
+        * If we didn't change anything, no need to wake up or kick other CPUs
+        */
+       if (set == level)
+               return 0;
+
+       /*
+        * The vcpu's virtual IRQ/FIQ state was updated; wake up sleeping VCPUs and
+        * trigger a world-switch round on the running physical CPU to set the
+        * virtual IRQ/FIQ fields in the HCR appropriately.
+        */
+       kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+       kvm_vcpu_kick(vcpu);
+
+       return 0;
+}
+
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
+                         bool line_status)
+{
+       u32 irq = irq_level->irq;
+       unsigned int irq_type, vcpu_idx, irq_num;
+       int nrcpus = atomic_read(&kvm->online_vcpus);
+       struct kvm_vcpu *vcpu = NULL;
+       bool level = irq_level->level;
+
+       irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK;
+       vcpu_idx = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK;
+       vcpu_idx += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1);
+       irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK;
+
+       trace_kvm_irq_line(irq_type, vcpu_idx, irq_num, irq_level->level);
+
+       switch (irq_type) {
+       case KVM_ARM_IRQ_TYPE_CPU:
+               if (irqchip_in_kernel(kvm))
+                       return -ENXIO;
+
+               if (vcpu_idx >= nrcpus)
+                       return -EINVAL;
+
+               vcpu = kvm_get_vcpu(kvm, vcpu_idx);
+               if (!vcpu)
+                       return -EINVAL;
+
+               if (irq_num > KVM_ARM_IRQ_CPU_FIQ)
+                       return -EINVAL;
+
+               return vcpu_interrupt_line(vcpu, irq_num, level);
+       case KVM_ARM_IRQ_TYPE_PPI:
+               if (!irqchip_in_kernel(kvm))
+                       return -ENXIO;
+
+               if (vcpu_idx >= nrcpus)
+                       return -EINVAL;
+
+               vcpu = kvm_get_vcpu(kvm, vcpu_idx);
+               if (!vcpu)
+                       return -EINVAL;
+
+               if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS)
+                       return -EINVAL;
+
+               return kvm_vgic_inject_irq(kvm, vcpu->vcpu_id, irq_num, level, NULL);
+       case KVM_ARM_IRQ_TYPE_SPI:
+               if (!irqchip_in_kernel(kvm))
+                       return -ENXIO;
+
+               if (irq_num < VGIC_NR_PRIVATE_IRQS)
+                       return -EINVAL;
+
+               return kvm_vgic_inject_irq(kvm, 0, irq_num, level, NULL);
+       }
+
+       return -EINVAL;
+}
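
As a usage note, the irq field decoded above is assembled by userspace from the KVM_ARM_IRQ_* constants in <linux/kvm.h>; the sketch below is illustrative only (vm_fd is a placeholder) and asserts an SPI on the in-kernel GIC.

/* Illustrative userspace sketch, not part of this patch: raise or lower
 * a shared peripheral interrupt (SPI) via KVM_IRQ_LINE.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int set_spi(int vm_fd, unsigned int intid, int level)
{
	struct kvm_irq_level irq_level = {
		.irq	= (KVM_ARM_IRQ_TYPE_SPI << KVM_ARM_IRQ_TYPE_SHIFT) |
			  (intid << KVM_ARM_IRQ_NUM_SHIFT),
		.level	= level,
	};

	return ioctl(vm_fd, KVM_IRQ_LINE, &irq_level);
}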
+
+static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
+                              const struct kvm_vcpu_init *init)
+{
+       unsigned int i, ret;
+       int phys_target = kvm_target_cpu();
+
+       if (init->target != phys_target)
+               return -EINVAL;
+
+       /*
+        * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
+        * use the same target.
+        */
+       if (vcpu->arch.target != -1 && vcpu->arch.target != init->target)
+               return -EINVAL;
+
+       /* -ENOENT for unknown features, -EINVAL for invalid combinations. */
+       for (i = 0; i < sizeof(init->features) * 8; i++) {
+               bool set = (init->features[i / 32] & (1 << (i % 32)));
+
+               if (set && i >= KVM_VCPU_MAX_FEATURES)
+                       return -ENOENT;
+
+               /*
+                * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
+                * use the same feature set.
+                */
+               if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES &&
+                   test_bit(i, vcpu->arch.features) != set)
+                       return -EINVAL;
+
+               if (set)
+                       set_bit(i, vcpu->arch.features);
+       }
+
+       vcpu->arch.target = phys_target;
+
+       /* Now we know what it is, we can reset it. */
+       ret = kvm_reset_vcpu(vcpu);
+       if (ret) {
+               vcpu->arch.target = -1;
+               bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
+       }
+
+       return ret;
+}
+
+static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
+                                        struct kvm_vcpu_init *init)
+{
+       int ret;
+
+       ret = kvm_vcpu_set_target(vcpu, init);
+       if (ret)
+               return ret;
+
+       /*
+        * Ensure a rebooted VM will fault in RAM pages, so that KVM can
+        * detect whether the guest MMU is turned off and flush the caches
+        * as needed.
+        */
+       if (vcpu->arch.has_run_once)
+               stage2_unmap_vm(vcpu->kvm);
+
+       vcpu_reset_hcr(vcpu);
+
+       /*
+        * Handle the "start in power-off" case.
+        */
+       if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
+               vcpu_power_off(vcpu);
+       else
+               vcpu->arch.power_off = false;
+
+       return 0;
+}
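
A typical caller pairs this with KVM_ARM_PREFERRED_TARGET, roughly as in the sketch below; vm_fd/vcpu_fd are placeholders and the code is illustrative only, not part of this patch.

/* Illustrative userspace sketch, not part of this patch: initialize a
 * vCPU with the host's preferred target and no optional features.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int init_vcpu(int vm_fd, int vcpu_fd)
{
	struct kvm_vcpu_init init;

	/* Ask KVM which target this host prefers... */
	if (ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &init) < 0)
		return -1;

	/* ...then opt in to features by setting bits in init.features[],
	 * e.g. KVM_ARM_VCPU_POWER_OFF to start the vCPU powered off. */
	return ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &init);
}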
+
+static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu,
+                                struct kvm_device_attr *attr)
+{
+       int ret = -ENXIO;
+
+       switch (attr->group) {
+       default:
+               ret = kvm_arm_vcpu_arch_set_attr(vcpu, attr);
+               break;
+       }
+
+       return ret;
+}
+
+static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu,
+                                struct kvm_device_attr *attr)
+{
+       int ret = -ENXIO;
+
+       switch (attr->group) {
+       default:
+               ret = kvm_arm_vcpu_arch_get_attr(vcpu, attr);
+               break;
+       }
+
+       return ret;
+}
+
+static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu,
+                                struct kvm_device_attr *attr)
+{
+       int ret = -ENXIO;
+
+       switch (attr->group) {
+       default:
+               ret = kvm_arm_vcpu_arch_has_attr(vcpu, attr);
+               break;
+       }
+
+       return ret;
+}
+
+static int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
+                                  struct kvm_vcpu_events *events)
+{
+       memset(events, 0, sizeof(*events));
+
+       return __kvm_arm_vcpu_get_events(vcpu, events);
+}
+
+static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
+                                  struct kvm_vcpu_events *events)
+{
+       int i;
+
+       /* check whether the reserved field is zero */
+       for (i = 0; i < ARRAY_SIZE(events->reserved); i++)
+               if (events->reserved[i])
+                       return -EINVAL;
+
+       /* check whether the pad field is zero */
+       for (i = 0; i < ARRAY_SIZE(events->exception.pad); i++)
+               if (events->exception.pad[i])
+                       return -EINVAL;
+
+       return __kvm_arm_vcpu_set_events(vcpu, events);
+}
+
+long kvm_arch_vcpu_ioctl(struct file *filp,
+                        unsigned int ioctl, unsigned long arg)
+{
+       struct kvm_vcpu *vcpu = filp->private_data;
+       void __user *argp = (void __user *)arg;
+       struct kvm_device_attr attr;
+       long r;
+
+       switch (ioctl) {
+       case KVM_ARM_VCPU_INIT: {
+               struct kvm_vcpu_init init;
+
+               r = -EFAULT;
+               if (copy_from_user(&init, argp, sizeof(init)))
+                       break;
+
+               r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init);
+               break;
+       }
+       case KVM_SET_ONE_REG:
+       case KVM_GET_ONE_REG: {
+               struct kvm_one_reg reg;
+
+               r = -ENOEXEC;
+               if (unlikely(!kvm_vcpu_initialized(vcpu)))
+                       break;
+
+               r = -EFAULT;
+               if (copy_from_user(&reg, argp, sizeof(reg)))
+                       break;
+
+               if (ioctl == KVM_SET_ONE_REG)
+                       r = kvm_arm_set_reg(vcpu, &reg);
+               else
+                       r = kvm_arm_get_reg(vcpu, &reg);
+               break;
+       }
+       case KVM_GET_REG_LIST: {
+               struct kvm_reg_list __user *user_list = argp;
+               struct kvm_reg_list reg_list;
+               unsigned n;
+
+               r = -ENOEXEC;
+               if (unlikely(!kvm_vcpu_initialized(vcpu)))
+                       break;
+
+               r = -EPERM;
+               if (!kvm_arm_vcpu_is_finalized(vcpu))
+                       break;
+
+               r = -EFAULT;
+               if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
+                       break;
+               n = reg_list.n;
+               reg_list.n = kvm_arm_num_regs(vcpu);
+               if (copy_to_user(user_list, &reg_list, sizeof(reg_list)))
+                       break;
+               r = -E2BIG;
+               if (n < reg_list.n)
+                       break;
+               r = kvm_arm_copy_reg_indices(vcpu, user_list->reg);
+               break;
+       }
+       case KVM_SET_DEVICE_ATTR: {
+               r = -EFAULT;
+               if (copy_from_user(&attr, argp, sizeof(attr)))
+                       break;
+               r = kvm_arm_vcpu_set_attr(vcpu, &attr);
+               break;
+       }
+       case KVM_GET_DEVICE_ATTR: {
+               r = -EFAULT;
+               if (copy_from_user(&attr, argp, sizeof(attr)))
+                       break;
+               r = kvm_arm_vcpu_get_attr(vcpu, &attr);
+               break;
+       }
+       case KVM_HAS_DEVICE_ATTR: {
+               r = -EFAULT;
+               if (copy_from_user(&attr, argp, sizeof(attr)))
+                       break;
+               r = kvm_arm_vcpu_has_attr(vcpu, &attr);
+               break;
+       }
+       case KVM_GET_VCPU_EVENTS: {
+               struct kvm_vcpu_events events;
+
+               if (kvm_arm_vcpu_get_events(vcpu, &events))
+                       return -EINVAL;
+
+               if (copy_to_user(argp, &events, sizeof(events)))
+                       return -EFAULT;
+
+               return 0;
+       }
+       case KVM_SET_VCPU_EVENTS: {
+               struct kvm_vcpu_events events;
+
+               if (copy_from_user(&events, argp, sizeof(events)))
+                       return -EFAULT;
+
+               return kvm_arm_vcpu_set_events(vcpu, &events);
+       }
+       case KVM_ARM_VCPU_FINALIZE: {
+               int what;
+
+               if (!kvm_vcpu_initialized(vcpu))
+                       return -ENOEXEC;
+
+               if (get_user(what, (const int __user *)argp))
+                       return -EFAULT;
+
+               return kvm_arm_vcpu_finalize(vcpu, what);
+       }
+       default:
+               r = -EINVAL;
+       }
+
+       return r;
+}
+
+void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
+{
+
+}
+
+void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
+                                       struct kvm_memory_slot *memslot)
+{
+       kvm_flush_remote_tlbs(kvm);
+}
+
+static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
+                                       struct kvm_arm_device_addr *dev_addr)
+{
+       unsigned long dev_id, type;
+
+       dev_id = (dev_addr->id & KVM_ARM_DEVICE_ID_MASK) >>
+               KVM_ARM_DEVICE_ID_SHIFT;
+       type = (dev_addr->id & KVM_ARM_DEVICE_TYPE_MASK) >>
+               KVM_ARM_DEVICE_TYPE_SHIFT;
+
+       switch (dev_id) {
+       case KVM_ARM_DEVICE_VGIC_V2:
+               if (!vgic_present)
+                       return -ENXIO;
+               return kvm_vgic_addr(kvm, type, &dev_addr->addr, true);
+       default:
+               return -ENODEV;
+       }
+}
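
For reference, userspace drives this legacy path with the KVM_ARM_SET_DEVICE_ADDR ioctl; a minimal illustrative sketch follows (vm_fd and the base address are placeholders, not from this patch).

/* Illustrative userspace sketch, not part of this patch: place the
 * GICv2 distributor with the legacy KVM_ARM_SET_DEVICE_ADDR ioctl.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int set_gicv2_dist_addr(int vm_fd, __u64 base)
{
	struct kvm_arm_device_addr dev_addr = {
		.id   = ((__u64)KVM_ARM_DEVICE_VGIC_V2 << KVM_ARM_DEVICE_ID_SHIFT) |
			KVM_VGIC_V2_ADDR_TYPE_DIST,
		.addr = base,
	};

	return ioctl(vm_fd, KVM_ARM_SET_DEVICE_ADDR, &dev_addr);
}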
+
+long kvm_arch_vm_ioctl(struct file *filp,
+                      unsigned int ioctl, unsigned long arg)
+{
+       struct kvm *kvm = filp->private_data;
+       void __user *argp = (void __user *)arg;
+
+       switch (ioctl) {
+       case KVM_CREATE_IRQCHIP: {
+               int ret;
+               if (!vgic_present)
+                       return -ENXIO;
+               mutex_lock(&kvm->lock);
+               ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
+               mutex_unlock(&kvm->lock);
+               return ret;
+       }
+       case KVM_ARM_SET_DEVICE_ADDR: {
+               struct kvm_arm_device_addr dev_addr;
+
+               if (copy_from_user(&dev_addr, argp, sizeof(dev_addr)))
+                       return -EFAULT;
+               return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr);
+       }
+       case KVM_ARM_PREFERRED_TARGET: {
+               int err;
+               struct kvm_vcpu_init init;
+
+               err = kvm_vcpu_preferred_target(&init);
+               if (err)
+                       return err;
+
+               if (copy_to_user(argp, &init, sizeof(init)))
+                       return -EFAULT;
+
+               return 0;
+       }
+       default:
+               return -EINVAL;
+       }
+}
+
+static void cpu_init_hyp_mode(void)
+{
+       phys_addr_t pgd_ptr;
+       unsigned long hyp_stack_ptr;
+       unsigned long stack_page;
+       unsigned long vector_ptr;
+
+       /* Switch from the HYP stub to our own HYP init vector */
+       __hyp_set_vectors(kvm_get_idmap_vector());
+
+       pgd_ptr = kvm_mmu_get_httbr();
+       stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
+       hyp_stack_ptr = stack_page + PAGE_SIZE;
+       vector_ptr = (unsigned long)kvm_get_hyp_vector();
+
+       __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr);
+       __cpu_init_stage2();
+}
+
+static void cpu_hyp_reset(void)
+{
+       if (!is_kernel_in_hyp_mode())
+               __hyp_reset_vectors();
+}
+
+static void cpu_hyp_reinit(void)
+{
+       kvm_init_host_cpu_context(&this_cpu_ptr(&kvm_host_data)->host_ctxt);
+
+       cpu_hyp_reset();
+
+       if (is_kernel_in_hyp_mode())
+               kvm_timer_init_vhe();
+       else
+               cpu_init_hyp_mode();
+
+       kvm_arm_init_debug();
+
+       if (vgic_present)
+               kvm_vgic_init_cpu_hardware();
+}
+
+static void _kvm_arch_hardware_enable(void *discard)
+{
+       if (!__this_cpu_read(kvm_arm_hardware_enabled)) {
+               cpu_hyp_reinit();
+               __this_cpu_write(kvm_arm_hardware_enabled, 1);
+       }
+}
+
+int kvm_arch_hardware_enable(void)
+{
+       _kvm_arch_hardware_enable(NULL);
+       return 0;
+}
+
+static void _kvm_arch_hardware_disable(void *discard)
+{
+       if (__this_cpu_read(kvm_arm_hardware_enabled)) {
+               cpu_hyp_reset();
+               __this_cpu_write(kvm_arm_hardware_enabled, 0);
+       }
+}
+
+void kvm_arch_hardware_disable(void)
+{
+       _kvm_arch_hardware_disable(NULL);
+}
+
+#ifdef CONFIG_CPU_PM
+static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
+                                   unsigned long cmd,
+                                   void *v)
+{
+       /*
+        * kvm_arm_hardware_enabled is left with its old value over
+        * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should
+        * re-enable hyp.
+        */
+       switch (cmd) {
+       case CPU_PM_ENTER:
+               if (__this_cpu_read(kvm_arm_hardware_enabled))
+                       /*
+                        * don't update kvm_arm_hardware_enabled here
+                        * so that the hardware will be re-enabled
+                        * when we resume. See below.
+                        */
+                       cpu_hyp_reset();
+
+               return NOTIFY_OK;
+       case CPU_PM_ENTER_FAILED:
+       case CPU_PM_EXIT:
+               if (__this_cpu_read(kvm_arm_hardware_enabled))
+                       /* The hardware was enabled before suspend. */
+                       cpu_hyp_reinit();
+
+               return NOTIFY_OK;
+
+       default:
+               return NOTIFY_DONE;
+       }
+}
+
+static struct notifier_block hyp_init_cpu_pm_nb = {
+       .notifier_call = hyp_init_cpu_pm_notifier,
+};
+
+static void __init hyp_cpu_pm_init(void)
+{
+       cpu_pm_register_notifier(&hyp_init_cpu_pm_nb);
+}
+static void __init hyp_cpu_pm_exit(void)
+{
+       cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb);
+}
+#else
+static inline void hyp_cpu_pm_init(void)
+{
+}
+static inline void hyp_cpu_pm_exit(void)
+{
+}
+#endif
+
+static int init_common_resources(void)
+{
+       kvm_set_ipa_limit();
+
+       return 0;
+}
+
+static int init_subsystems(void)
+{
+       int err = 0;
+
+       /*
+        * Enable hardware so that subsystem initialisation can access EL2.
+        */
+       on_each_cpu(_kvm_arch_hardware_enable, NULL, 1);
+
+       /*
+        * Register the CPU PM (low-power) notifier
+        */
+       hyp_cpu_pm_init();
+
+       /*
+        * Init HYP view of VGIC
+        */
+       err = kvm_vgic_hyp_init();
+       switch (err) {
+       case 0:
+               vgic_present = true;
+               break;
+       case -ENODEV:
+       case -ENXIO:
+               vgic_present = false;
+               err = 0;
+               break;
+       default:
+               goto out;
+       }
+
+       /*
+        * Init HYP architected timer support
+        */
+       err = kvm_timer_hyp_init(vgic_present);
+       if (err)
+               goto out;
+
+       kvm_perf_init();
+       kvm_coproc_table_init();
+
+out:
+       on_each_cpu(_kvm_arch_hardware_disable, NULL, 1);
+
+       return err;
+}
+
+static void teardown_hyp_mode(void)
+{
+       int cpu;
+
+       free_hyp_pgds();
+       for_each_possible_cpu(cpu)
+               free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
+}
+
+/**
+ * init_hyp_mode - initialize Hyp-mode on all online CPUs
+ */
+static int init_hyp_mode(void)
+{
+       int cpu;
+       int err = 0;
+
+       /*
+        * Allocate Hyp PGD and set up Hyp identity mapping
+        */
+       err = kvm_mmu_init();
+       if (err)
+               goto out_err;
+
+       /*
+        * Allocate stack pages for Hypervisor-mode
+        */
+       for_each_possible_cpu(cpu) {
+               unsigned long stack_page;
+
+               stack_page = __get_free_page(GFP_KERNEL);
+               if (!stack_page) {
+                       err = -ENOMEM;
+                       goto out_err;
+               }
+
+               per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
+       }
+
+       /*
+        * Map the Hyp-code called directly from the host
+        */
+       err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
+                                 kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC);
+       if (err) {
+               kvm_err("Cannot map world-switch code\n");
+               goto out_err;
+       }
+
+       err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
+                                 kvm_ksym_ref(__end_rodata), PAGE_HYP_RO);
+       if (err) {
+               kvm_err("Cannot map rodata section\n");
+               goto out_err;
+       }
+
+       err = create_hyp_mappings(kvm_ksym_ref(__bss_start),
+                                 kvm_ksym_ref(__bss_stop), PAGE_HYP_RO);
+       if (err) {
+               kvm_err("Cannot map bss section\n");
+               goto out_err;
+       }
+
+       err = kvm_map_vectors();
+       if (err) {
+               kvm_err("Cannot map vectors\n");
+               goto out_err;
+       }
+
+       /*
+        * Map the Hyp stack pages
+        */
+       for_each_possible_cpu(cpu) {
+               char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
+               err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE,
+                                         PAGE_HYP);
+
+               if (err) {
+                       kvm_err("Cannot map hyp stack\n");
+                       goto out_err;
+               }
+       }
+
+       for_each_possible_cpu(cpu) {
+               kvm_host_data_t *cpu_data;
+
+               cpu_data = per_cpu_ptr(&kvm_host_data, cpu);
+               err = create_hyp_mappings(cpu_data, cpu_data + 1, PAGE_HYP);
+
+               if (err) {
+                       kvm_err("Cannot map host CPU state: %d\n", err);
+                       goto out_err;
+               }
+       }
+
+       err = hyp_map_aux_data();
+       if (err)
+               kvm_err("Cannot map host auxiliary data: %d\n", err);
+
+       return 0;
+
+out_err:
+       teardown_hyp_mode();
+       kvm_err("error initializing Hyp mode: %d\n", err);
+       return err;
+}
+
+static void check_kvm_target_cpu(void *ret)
+{
+       *(int *)ret = kvm_target_cpu();
+}
+
+struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
+{
+       struct kvm_vcpu *vcpu;
+       int i;
+
+       mpidr &= MPIDR_HWID_BITMASK;
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu))
+                       return vcpu;
+       }
+       return NULL;
+}
+
+bool kvm_arch_has_irq_bypass(void)
+{
+       return true;
+}
+
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+                                     struct irq_bypass_producer *prod)
+{
+       struct kvm_kernel_irqfd *irqfd =
+               container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+       return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq,
+                                         &irqfd->irq_entry);
+}
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+                                     struct irq_bypass_producer *prod)
+{
+       struct kvm_kernel_irqfd *irqfd =
+               container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+       kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq,
+                                    &irqfd->irq_entry);
+}
+
+void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
+{
+       struct kvm_kernel_irqfd *irqfd =
+               container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+       kvm_arm_halt_guest(irqfd->kvm);
+}
+
+void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons)
+{
+       struct kvm_kernel_irqfd *irqfd =
+               container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+       kvm_arm_resume_guest(irqfd->kvm);
+}
+
+/**
+ * Initialize Hyp-mode and memory mappings on all CPUs.
+ */
+int kvm_arch_init(void *opaque)
+{
+       int err;
+       int ret, cpu;
+       bool in_hyp_mode;
+
+       if (!is_hyp_mode_available()) {
+               kvm_info("HYP mode not available\n");
+               return -ENODEV;
+       }
+
+       in_hyp_mode = is_kernel_in_hyp_mode();
+
+       if (!in_hyp_mode && kvm_arch_requires_vhe()) {
+               kvm_pr_unimpl("CPU unsupported in non-VHE mode, not initializing\n");
+               return -ENODEV;
+       }
+
+       for_each_online_cpu(cpu) {
+               smp_call_function_single(cpu, check_kvm_target_cpu, &ret, 1);
+               if (ret < 0) {
+                       kvm_err("Error, CPU %d not supported!\n", cpu);
+                       return -ENODEV;
+               }
+       }
+
+       err = init_common_resources();
+       if (err)
+               return err;
+
+       err = kvm_arm_init_sve();
+       if (err)
+               return err;
+
+       if (!in_hyp_mode) {
+               err = init_hyp_mode();
+               if (err)
+                       goto out_err;
+       }
+
+       err = init_subsystems();
+       if (err)
+               goto out_hyp;
+
+       if (in_hyp_mode)
+               kvm_info("VHE mode initialized successfully\n");
+       else
+               kvm_info("Hyp mode initialized successfully\n");
+
+       return 0;
+
+out_hyp:
+       hyp_cpu_pm_exit();
+       if (!in_hyp_mode)
+               teardown_hyp_mode();
+out_err:
+       return err;
+}
+
+/* NOP: Compiling as a module not supported */
+void kvm_arch_exit(void)
+{
+       kvm_perf_teardown();
+}
+
+static int arm_init(void)
+{
+       int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+       return rc;
+}
+
+module_init(arm_init);
index aacfc55de44cb90cc641186bbc1512f2652a6faf..eb194696ef62250535648be848e9b725105802d1 100644 (file)
@@ -23,7 +23,7 @@
 #include <kvm/arm_hypercalls.h>
 
 #define CREATE_TRACE_POINTS
-#include "trace.h"
+#include "trace_handle_exit.h"
 
 typedef int (*exit_handle_fn)(struct kvm_vcpu *, struct kvm_run *);
 
index ea710f674cb6b4f41e55e9c261ee3be6c980b7c8..dc18274a68262fb0be6db719dfd0f834911b8e2c 100644 (file)
@@ -6,12 +6,9 @@
 ccflags-y += -fno-stack-protector -DDISABLE_BRANCH_PROFILING \
                $(DISABLE_STACKLEAK_PLUGIN)
 
-KVM=../../../../virt/kvm
-
-obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v3-sr.o
-obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/timer-sr.o
-obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/aarch32.o
-
+obj-$(CONFIG_KVM_ARM_HOST) += vgic-v3-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += timer-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += aarch32.o
 obj-$(CONFIG_KVM_ARM_HOST) += vgic-v2-cpuif-proxy.o
 obj-$(CONFIG_KVM_ARM_HOST) += sysreg-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += debug-sr.o
diff --git a/arch/arm64/kvm/hyp/aarch32.c b/arch/arm64/kvm/hyp/aarch32.c
new file mode 100644 (file)
index 0000000..25c0e47
--- /dev/null
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Hyp portion of the (not much of an) Emulation layer for 32bit guests.
+ *
+ * Copyright (C) 2012,2013 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * based on arch/arm/kvm/emulate.c
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall <c.dall@virtualopensystems.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
+
+/*
+ * stolen from arch/arm/kernel/opcodes.c
+ *
+ * condition code lookup table
+ * index into the table is test code: EQ, NE, ... LT, GT, AL, NV
+ *
+ * bit position in short is condition code: NZCV
+ */
+static const unsigned short cc_map[16] = {
+       0xF0F0,                 /* EQ == Z set            */
+       0x0F0F,                 /* NE                     */
+       0xCCCC,                 /* CS == C set            */
+       0x3333,                 /* CC                     */
+       0xFF00,                 /* MI == N set            */
+       0x00FF,                 /* PL                     */
+       0xAAAA,                 /* VS == V set            */
+       0x5555,                 /* VC                     */
+       0x0C0C,                 /* HI == C set && Z clear */
+       0xF3F3,                 /* LS == C clear || Z set */
+       0xAA55,                 /* GE == (N==V)           */
+       0x55AA,                 /* LT == (N!=V)           */
+       0x0A05,                 /* GT == (!Z && (N==V))   */
+       0xF5FA,                 /* LE == (Z || (N!=V))    */
+       0xFFFF,                 /* AL always              */
+       0                       /* NV                     */
+};
+
+/*
+ * Check if a trapped instruction should have been executed or not.
+ */
+bool __hyp_text kvm_condition_valid32(const struct kvm_vcpu *vcpu)
+{
+       unsigned long cpsr;
+       u32 cpsr_cond;
+       int cond;
+
+       /* Top two bits non-zero?  Unconditional. */
+       if (kvm_vcpu_get_hsr(vcpu) >> 30)
+               return true;
+
+       /* Is condition field valid? */
+       cond = kvm_vcpu_get_condition(vcpu);
+       if (cond == 0xE)
+               return true;
+
+       cpsr = *vcpu_cpsr(vcpu);
+
+       if (cond < 0) {
+               /* This can happen in Thumb mode: examine IT state. */
+               unsigned long it;
+
+               it = ((cpsr >> 8) & 0xFC) | ((cpsr >> 25) & 0x3);
+
+               /* it == 0 => unconditional. */
+               if (it == 0)
+                       return true;
+
+               /* The cond for this insn works out as the top 4 bits. */
+               cond = (it >> 4);
+       }
+
+       cpsr_cond = cpsr >> 28;
+
+       if (!((cc_map[cond] >> cpsr_cond) & 1))
+               return false;
+
+       return true;
+}
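
As a worked example of the cc_map lookup: for EQ (condition 0) the entry is 0xF0F0, which has a bit set exactly at the NZCV values whose Z bit is set. The helper below is hypothetical and only mirrors that check for illustration; it is not part of this patch.

/* Illustrative only, not part of this patch: evaluate a 4-bit AArch32
 * condition code against an NZCV nibble (N=bit3, Z=bit2, C=bit1, V=bit0),
 * exactly as kvm_condition_valid32() does with cpsr >> 28.
 */
static int cond_passes(unsigned int cond, unsigned int nzcv)
{
	static const unsigned short cc_map_copy[16] = {
		0xF0F0, 0x0F0F, 0xCCCC, 0x3333, 0xFF00, 0x00FF, 0xAAAA, 0x5555,
		0x0C0C, 0xF3F3, 0xAA55, 0x55AA, 0x0A05, 0xF5FA, 0xFFFF, 0x0000,
	};

	return (cc_map_copy[cond & 0xf] >> (nzcv & 0xf)) & 1;
}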
+
+/**
+ * kvm_adjust_itstate - adjust ITSTATE when emulating instructions in IT-block
+ * @vcpu:      The VCPU pointer
+ *
+ * When exceptions occur while instructions are executed in Thumb IF-THEN
+ * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have
+ * to do this little bit of work manually. The fields map like this:
+ *
+ * IT[7:0] -> CPSR[26:25],CPSR[15:10]
+ */
+static void __hyp_text kvm_adjust_itstate(struct kvm_vcpu *vcpu)
+{
+       unsigned long itbits, cond;
+       unsigned long cpsr = *vcpu_cpsr(vcpu);
+       bool is_arm = !(cpsr & PSR_AA32_T_BIT);
+
+       if (is_arm || !(cpsr & PSR_AA32_IT_MASK))
+               return;
+
+       cond = (cpsr & 0xe000) >> 13;
+       itbits = (cpsr & 0x1c00) >> (10 - 2);
+       itbits |= (cpsr & (0x3 << 25)) >> 25;
+
+       /* Perform ITAdvance (see page A2-52 in ARM DDI 0406C) */
+       if ((itbits & 0x7) == 0)
+               itbits = cond = 0;
+       else
+               itbits = (itbits << 1) & 0x1f;
+
+       cpsr &= ~PSR_AA32_IT_MASK;
+       cpsr |= cond << 13;
+       cpsr |= (itbits & 0x1c) << (10 - 2);
+       cpsr |= (itbits & 0x3) << 25;
+       *vcpu_cpsr(vcpu) = cpsr;
+}
+
+/**
+ * kvm_skip_instr32 - skip a trapped instruction and proceed to the next
+ * @vcpu: The vcpu pointer
+ * @is_wide_instr: true if the trapped instruction is a 32bit (wide) Thumb instruction
+ */
+void __hyp_text kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr)
+{
+       u32 pc = *vcpu_pc(vcpu);
+       bool is_thumb;
+
+       is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_AA32_T_BIT);
+       if (is_thumb && !is_wide_instr)
+               pc += 2;
+       else
+               pc += 4;
+
+       *vcpu_pc(vcpu) = pc;
+
+       kvm_adjust_itstate(vcpu);
+}
diff --git a/arch/arm64/kvm/hyp/timer-sr.c b/arch/arm64/kvm/hyp/timer-sr.c
new file mode 100644 (file)
index 0000000..ff76e68
--- /dev/null
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012-2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+
+#include <clocksource/arm_arch_timer.h>
+#include <linux/compiler.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_hyp.h>
+
+void __hyp_text __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high)
+{
+       u64 cntvoff = (u64)cntvoff_high << 32 | cntvoff_low;
+       write_sysreg(cntvoff, cntvoff_el2);
+}
+
+/*
+ * Should only be called on non-VHE systems.
+ * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe().
+ */
+void __hyp_text __timer_disable_traps(struct kvm_vcpu *vcpu)
+{
+       u64 val;
+
+       /* Allow physical timer/counter access for the host */
+       val = read_sysreg(cnthctl_el2);
+       val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN;
+       write_sysreg(val, cnthctl_el2);
+}
+
+/*
+ * Should only be called on non-VHE systems.
+ * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe().
+ */
+void __hyp_text __timer_enable_traps(struct kvm_vcpu *vcpu)
+{
+       u64 val;
+
+       /*
+        * Disallow physical timer access for the guest
+        * Physical counter access is allowed
+        */
+       val = read_sysreg(cnthctl_el2);
+       val &= ~CNTHCTL_EL1PCEN;
+       val |= CNTHCTL_EL1PCTEN;
+       write_sysreg(val, cnthctl_el2);
+}
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
new file mode 100644 (file)
index 0000000..49fedf6
--- /dev/null
@@ -0,0 +1,1126 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012-2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+
+#include <linux/compiler.h>
+#include <linux/irqchip/arm-gic-v3.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+
+#define vtr_to_max_lr_idx(v)           ((v) & 0xf)
+#define vtr_to_nr_pre_bits(v)          ((((u32)(v) >> 26) & 7) + 1)
+#define vtr_to_nr_apr_regs(v)          (1 << (vtr_to_nr_pre_bits(v) - 5))
+
+static u64 __hyp_text __gic_v3_get_lr(unsigned int lr)
+{
+       switch (lr & 0xf) {
+       case 0:
+               return read_gicreg(ICH_LR0_EL2);
+       case 1:
+               return read_gicreg(ICH_LR1_EL2);
+       case 2:
+               return read_gicreg(ICH_LR2_EL2);
+       case 3:
+               return read_gicreg(ICH_LR3_EL2);
+       case 4:
+               return read_gicreg(ICH_LR4_EL2);
+       case 5:
+               return read_gicreg(ICH_LR5_EL2);
+       case 6:
+               return read_gicreg(ICH_LR6_EL2);
+       case 7:
+               return read_gicreg(ICH_LR7_EL2);
+       case 8:
+               return read_gicreg(ICH_LR8_EL2);
+       case 9:
+               return read_gicreg(ICH_LR9_EL2);
+       case 10:
+               return read_gicreg(ICH_LR10_EL2);
+       case 11:
+               return read_gicreg(ICH_LR11_EL2);
+       case 12:
+               return read_gicreg(ICH_LR12_EL2);
+       case 13:
+               return read_gicreg(ICH_LR13_EL2);
+       case 14:
+               return read_gicreg(ICH_LR14_EL2);
+       case 15:
+               return read_gicreg(ICH_LR15_EL2);
+       }
+
+       unreachable();
+}
+
+static void __hyp_text __gic_v3_set_lr(u64 val, int lr)
+{
+       switch (lr & 0xf) {
+       case 0:
+               write_gicreg(val, ICH_LR0_EL2);
+               break;
+       case 1:
+               write_gicreg(val, ICH_LR1_EL2);
+               break;
+       case 2:
+               write_gicreg(val, ICH_LR2_EL2);
+               break;
+       case 3:
+               write_gicreg(val, ICH_LR3_EL2);
+               break;
+       case 4:
+               write_gicreg(val, ICH_LR4_EL2);
+               break;
+       case 5:
+               write_gicreg(val, ICH_LR5_EL2);
+               break;
+       case 6:
+               write_gicreg(val, ICH_LR6_EL2);
+               break;
+       case 7:
+               write_gicreg(val, ICH_LR7_EL2);
+               break;
+       case 8:
+               write_gicreg(val, ICH_LR8_EL2);
+               break;
+       case 9:
+               write_gicreg(val, ICH_LR9_EL2);
+               break;
+       case 10:
+               write_gicreg(val, ICH_LR10_EL2);
+               break;
+       case 11:
+               write_gicreg(val, ICH_LR11_EL2);
+               break;
+       case 12:
+               write_gicreg(val, ICH_LR12_EL2);
+               break;
+       case 13:
+               write_gicreg(val, ICH_LR13_EL2);
+               break;
+       case 14:
+               write_gicreg(val, ICH_LR14_EL2);
+               break;
+       case 15:
+               write_gicreg(val, ICH_LR15_EL2);
+               break;
+       }
+}
+
+static void __hyp_text __vgic_v3_write_ap0rn(u32 val, int n)
+{
+       switch (n) {
+       case 0:
+               write_gicreg(val, ICH_AP0R0_EL2);
+               break;
+       case 1:
+               write_gicreg(val, ICH_AP0R1_EL2);
+               break;
+       case 2:
+               write_gicreg(val, ICH_AP0R2_EL2);
+               break;
+       case 3:
+               write_gicreg(val, ICH_AP0R3_EL2);
+               break;
+       }
+}
+
+static void __hyp_text __vgic_v3_write_ap1rn(u32 val, int n)
+{
+       switch (n) {
+       case 0:
+               write_gicreg(val, ICH_AP1R0_EL2);
+               break;
+       case 1:
+               write_gicreg(val, ICH_AP1R1_EL2);
+               break;
+       case 2:
+               write_gicreg(val, ICH_AP1R2_EL2);
+               break;
+       case 3:
+               write_gicreg(val, ICH_AP1R3_EL2);
+               break;
+       }
+}
+
+static u32 __hyp_text __vgic_v3_read_ap0rn(int n)
+{
+       u32 val;
+
+       switch (n) {
+       case 0:
+               val = read_gicreg(ICH_AP0R0_EL2);
+               break;
+       case 1:
+               val = read_gicreg(ICH_AP0R1_EL2);
+               break;
+       case 2:
+               val = read_gicreg(ICH_AP0R2_EL2);
+               break;
+       case 3:
+               val = read_gicreg(ICH_AP0R3_EL2);
+               break;
+       default:
+               unreachable();
+       }
+
+       return val;
+}
+
+static u32 __hyp_text __vgic_v3_read_ap1rn(int n)
+{
+       u32 val;
+
+       switch (n) {
+       case 0:
+               val = read_gicreg(ICH_AP1R0_EL2);
+               break;
+       case 1:
+               val = read_gicreg(ICH_AP1R1_EL2);
+               break;
+       case 2:
+               val = read_gicreg(ICH_AP1R2_EL2);
+               break;
+       case 3:
+               val = read_gicreg(ICH_AP1R3_EL2);
+               break;
+       default:
+               unreachable();
+       }
+
+       return val;
+}
+
+void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+       u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+
+       /*
+        * Make sure stores to the GIC via the memory mapped interface
+        * are now visible to the system register interface when reading the
+        * LRs, and when reading back the VMCR on non-VHE systems.
+        */
+       if (used_lrs || !has_vhe()) {
+               if (!cpu_if->vgic_sre) {
+                       dsb(sy);
+                       isb();
+               }
+       }
+
+       if (used_lrs || cpu_if->its_vpe.its_vm) {
+               int i;
+               u32 elrsr;
+
+               elrsr = read_gicreg(ICH_ELRSR_EL2);
+
+               write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2);
+
+               for (i = 0; i < used_lrs; i++) {
+                       if (elrsr & (1 << i))
+                               cpu_if->vgic_lr[i] &= ~ICH_LR_STATE;
+                       else
+                               cpu_if->vgic_lr[i] = __gic_v3_get_lr(i);
+
+                       __gic_v3_set_lr(0, i);
+               }
+       }
+}
+
+void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+       u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+       int i;
+
+       if (used_lrs || cpu_if->its_vpe.its_vm) {
+               write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
+
+               for (i = 0; i < used_lrs; i++)
+                       __gic_v3_set_lr(cpu_if->vgic_lr[i], i);
+       }
+
+       /*
+        * Ensure that writes to the LRs, and on non-VHE systems the write
+        * to the VMCR in __vgic_v3_activate_traps(), have reached the
+        * (re)distributors. This ensures the guest will read the correct
+        * values from the memory-mapped interface.
+        */
+       if (used_lrs || !has_vhe()) {
+               if (!cpu_if->vgic_sre) {
+                       isb();
+                       dsb(sy);
+               }
+       }
+}
+
+void __hyp_text __vgic_v3_activate_traps(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+
+       /*
+        * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a
+        * Group0 interrupt (as generated in GICv2 mode) to be
+        * delivered as a FIQ to the guest, with potentially fatal
+        * consequences. So we must make sure that ICC_SRE_EL1 has
+        * been actually programmed with the value we want before
+        * starting to mess with the rest of the GIC, and VMCR_EL2 in
+        * particular.  This logic must be called before
+        * __vgic_v3_restore_state().
+        */
+       if (!cpu_if->vgic_sre) {
+               write_gicreg(0, ICC_SRE_EL1);
+               isb();
+               write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2);
+
+               if (has_vhe()) {
+                       /*
+                        * Ensure that the write to the VMCR will have reached
+                        * the (re)distributors. This ensures the guest will
+                        * read the correct values from the memory-mapped
+                        * interface.
+                        */
+                       isb();
+                       dsb(sy);
+               }
+       }
+
+       /*
+        * Prevent the guest from touching the GIC system registers if
+        * SRE isn't enabled for GICv3 emulation.
+        */
+       write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
+                    ICC_SRE_EL2);
+
+       /*
+        * If we need to trap system registers, we must write
+        * ICH_HCR_EL2 anyway, even if no interrupts are being
+        * injected.
+        */
+       if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
+           cpu_if->its_vpe.its_vm)
+               write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
+}
+
+void __hyp_text __vgic_v3_deactivate_traps(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+       u64 val;
+
+       if (!cpu_if->vgic_sre) {
+               cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2);
+       }
+
+       val = read_gicreg(ICC_SRE_EL2);
+       write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2);
+
+       if (!cpu_if->vgic_sre) {
+               /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */
+               isb();
+               write_gicreg(1, ICC_SRE_EL1);
+       }
+
+       /*
+        * If we were trapping system registers, we enabled the VGIC even if
+        * no interrupts were being injected, and we disable it again here.
+        */
+       if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
+           cpu_if->its_vpe.its_vm)
+               write_gicreg(0, ICH_HCR_EL2);
+}
+
+void __hyp_text __vgic_v3_save_aprs(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v3_cpu_if *cpu_if;
+       u64 val;
+       u32 nr_pre_bits;
+
+       vcpu = kern_hyp_va(vcpu);
+       cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+
+       val = read_gicreg(ICH_VTR_EL2);
+       nr_pre_bits = vtr_to_nr_pre_bits(val);
+
+       switch (nr_pre_bits) {
+       case 7:
+               cpu_if->vgic_ap0r[3] = __vgic_v3_read_ap0rn(3);
+               cpu_if->vgic_ap0r[2] = __vgic_v3_read_ap0rn(2);
+               /* Fall through */
+       case 6:
+               cpu_if->vgic_ap0r[1] = __vgic_v3_read_ap0rn(1);
+               /* Fall through */
+       default:
+               cpu_if->vgic_ap0r[0] = __vgic_v3_read_ap0rn(0);
+       }
+
+       switch (nr_pre_bits) {
+       case 7:
+               cpu_if->vgic_ap1r[3] = __vgic_v3_read_ap1rn(3);
+               cpu_if->vgic_ap1r[2] = __vgic_v3_read_ap1rn(2);
+               /* Fall through */
+       case 6:
+               cpu_if->vgic_ap1r[1] = __vgic_v3_read_ap1rn(1);
+               /* Fall through */
+       default:
+               cpu_if->vgic_ap1r[0] = __vgic_v3_read_ap1rn(0);
+       }
+}
+
+void __hyp_text __vgic_v3_restore_aprs(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v3_cpu_if *cpu_if;
+       u64 val;
+       u32 nr_pre_bits;
+
+       vcpu = kern_hyp_va(vcpu);
+       cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+
+       val = read_gicreg(ICH_VTR_EL2);
+       nr_pre_bits = vtr_to_nr_pre_bits(val);
+
+       switch (nr_pre_bits) {
+       case 7:
+               __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[3], 3);
+               __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[2], 2);
+               /* Fall through */
+       case 6:
+               __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[1], 1);
+               /* Fall through */
+       default:
+               __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[0], 0);
+       }
+
+       switch (nr_pre_bits) {
+       case 7:
+               __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[3], 3);
+               __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[2], 2);
+               /* Fall through */
+       case 6:
+               __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[1], 1);
+               /* Fall through */
+       default:
+               __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[0], 0);
+       }
+}
+
+void __hyp_text __vgic_v3_init_lrs(void)
+{
+       int max_lr_idx = vtr_to_max_lr_idx(read_gicreg(ICH_VTR_EL2));
+       int i;
+
+       for (i = 0; i <= max_lr_idx; i++)
+               __gic_v3_set_lr(0, i);
+}
+
+u64 __hyp_text __vgic_v3_get_ich_vtr_el2(void)
+{
+       return read_gicreg(ICH_VTR_EL2);
+}
+
+u64 __hyp_text __vgic_v3_read_vmcr(void)
+{
+       return read_gicreg(ICH_VMCR_EL2);
+}
+
+void __hyp_text __vgic_v3_write_vmcr(u32 vmcr)
+{
+       write_gicreg(vmcr, ICH_VMCR_EL2);
+}
+
+static int __hyp_text __vgic_v3_bpr_min(void)
+{
+       /* See Pseudocode for VPriorityGroup */
+       return 8 - vtr_to_nr_pre_bits(read_gicreg(ICH_VTR_EL2));
+}
+
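+/*
+ * Decode the register group of a trapped CPU interface access from the
+ * ESR: the banked Group-0 registers are encoded with CRm == 8, so this
+ * returns 0 for a Group-0 access and 1 for its Group-1 counterpart.
+ */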
+static int __hyp_text __vgic_v3_get_group(struct kvm_vcpu *vcpu)
+{
+       u32 esr = kvm_vcpu_get_hsr(vcpu);
+       u8 crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT;
+
+       return crm != 8;
+}
+
+#define GICv3_IDLE_PRIORITY    0xff
+
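+/*
+ * Scan the in-use list registers for the highest-priority pending
+ * interrupt whose group is enabled in the VMCR. Returns the LR index
+ * and stores its value in *lr_val, or returns -1 and a spurious INTID
+ * if nothing qualifies.
+ */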
+static int __hyp_text __vgic_v3_highest_priority_lr(struct kvm_vcpu *vcpu,
+                                                   u32 vmcr,
+                                                   u64 *lr_val)
+{
+       unsigned int used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+       u8 priority = GICv3_IDLE_PRIORITY;
+       int i, lr = -1;
+
+       for (i = 0; i < used_lrs; i++) {
+               u64 val = __gic_v3_get_lr(i);
+               u8 lr_prio = (val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;
+
+               /* Not pending in the state? */
+               if ((val & ICH_LR_STATE) != ICH_LR_PENDING_BIT)
+                       continue;
+
+               /* Group-0 interrupt, but Group-0 disabled? */
+               if (!(val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG0_MASK))
+                       continue;
+
+               /* Group-1 interrupt, but Group-1 disabled? */
+               if ((val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG1_MASK))
+                       continue;
+
+               /* Not the highest priority? */
+               if (lr_prio >= priority)
+                       continue;
+
+               /* This is a candidate */
+               priority = lr_prio;
+               *lr_val = val;
+               lr = i;
+       }
+
+       if (lr == -1)
+               *lr_val = ICC_IAR1_EL1_SPURIOUS;
+
+       return lr;
+}
+
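+/*
+ * Look for a list register holding @intid in the active state. Returns
+ * its index, or -1 (with a spurious INTID in *lr_val) if none matches.
+ */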
+static int __hyp_text __vgic_v3_find_active_lr(struct kvm_vcpu *vcpu,
+                                              int intid, u64 *lr_val)
+{
+       unsigned int used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+       int i;
+
+       for (i = 0; i < used_lrs; i++) {
+               u64 val = __gic_v3_get_lr(i);
+
+               if ((val & ICH_LR_VIRTUAL_ID_MASK) == intid &&
+                   (val & ICH_LR_ACTIVE_BIT)) {
+                       *lr_val = val;
+                       return i;
+               }
+       }
+
+       *lr_val = ICC_IAR1_EL1_SPURIOUS;
+       return -1;
+}
+
+static int __hyp_text __vgic_v3_get_highest_active_priority(void)
+{
+       u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2));
+       u32 hap = 0;
+       int i;
+
+       for (i = 0; i < nr_apr_regs; i++) {
+               u32 val;
+
+               /*
+                * The ICH_AP0Rn_EL2 and ICH_AP1Rn_EL2 registers
+                * contain the active priority levels for this VCPU
+                * for the maximum number of supported priority
+                * levels. We return the full priority level only if
+                * the BPR is programmed to its minimum; otherwise we
+                * return a combination of the priority level and
+                * subpriority, as determined by the BPR setting, but
+                * without the full subpriority.
+                */
+               val  = __vgic_v3_read_ap0rn(i);
+               val |= __vgic_v3_read_ap1rn(i);
+               if (!val) {
+                       hap += 32;
+                       continue;
+               }
+
+               return (hap + __ffs(val)) << __vgic_v3_bpr_min();
+       }
+
+       return GICv3_IDLE_PRIORITY;
+}
+
+static unsigned int __hyp_text __vgic_v3_get_bpr0(u32 vmcr)
+{
+       return (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT;
+}
+
+static unsigned int __hyp_text __vgic_v3_get_bpr1(u32 vmcr)
+{
+       unsigned int bpr;
+
+       if (vmcr & ICH_VMCR_CBPR_MASK) {
+               bpr = __vgic_v3_get_bpr0(vmcr);
+               if (bpr < 7)
+                       bpr++;
+       } else {
+               bpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;
+       }
+
+       return bpr;
+}
+
+/*
+ * Convert a priority to a preemption level, taking the relevant BPR
+ * into account by zeroing the sub-priority bits.
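+ * For example, with Group-1 and BPR1 == 4 (CBPR clear), a priority of
+ * 0x6a is masked down to preemption level 0x60.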
+ */
+static u8 __hyp_text __vgic_v3_pri_to_pre(u8 pri, u32 vmcr, int grp)
+{
+       unsigned int bpr;
+
+       if (!grp)
+               bpr = __vgic_v3_get_bpr0(vmcr) + 1;
+       else
+               bpr = __vgic_v3_get_bpr1(vmcr);
+
+       return pri & (GENMASK(7, 0) << bpr);
+}
+
+/*
+ * The priority value is independent of any of the BPR values, so we
+ * normalize it using the minimal BPR value. This guarantees that no
+ * matter what the guest does with its BPR, we can always set/get the
+ * same value of a priority.
+ */
+static void __hyp_text __vgic_v3_set_active_priority(u8 pri, u32 vmcr, int grp)
+{
+       u8 pre, ap;
+       u32 val;
+       int apr;
+
+       pre = __vgic_v3_pri_to_pre(pri, vmcr, grp);
+       ap = pre >> __vgic_v3_bpr_min();
+       apr = ap / 32;
+
+       if (!grp) {
+               val = __vgic_v3_read_ap0rn(apr);
+               __vgic_v3_write_ap0rn(val | BIT(ap % 32), apr);
+       } else {
+               val = __vgic_v3_read_ap1rn(apr);
+               __vgic_v3_write_ap1rn(val | BIT(ap % 32), apr);
+       }
+}
+
+static int __hyp_text __vgic_v3_clear_highest_active_priority(void)
+{
+       u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2));
+       u32 hap = 0;
+       int i;
+
+       for (i = 0; i < nr_apr_regs; i++) {
+               u32 ap0, ap1;
+               int c0, c1;
+
+               ap0 = __vgic_v3_read_ap0rn(i);
+               ap1 = __vgic_v3_read_ap1rn(i);
+               if (!ap0 && !ap1) {
+                       hap += 32;
+                       continue;
+               }
+
+               c0 = ap0 ? __ffs(ap0) : 32;
+               c1 = ap1 ? __ffs(ap1) : 32;
+
+               /* Always clear the LSB, which is the highest priority */
+               if (c0 < c1) {
+                       ap0 &= ~BIT(c0);
+                       __vgic_v3_write_ap0rn(ap0, i);
+                       hap += c0;
+               } else {
+                       ap1 &= ~BIT(c1);
+                       __vgic_v3_write_ap1rn(ap1, i);
+                       hap += c1;
+               }
+
+               /* Rescale to 8 bits of priority */
+               return hap << __vgic_v3_bpr_min();
+       }
+
+       return GICv3_IDLE_PRIORITY;
+}
+
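+/*
+ * Emulate a read of ICC_IAR{0,1}_EL1: pick the highest-priority pending
+ * LR, check its group against the accessed register, the priority mask
+ * and the running priority, then acknowledge it by setting the active
+ * state (LPIs have none) and the matching active-priority bit.
+ */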
+static void __hyp_text __vgic_v3_read_iar(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+       u64 lr_val;
+       u8 lr_prio, pmr;
+       int lr, grp;
+
+       grp = __vgic_v3_get_group(vcpu);
+
+       lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val);
+       if (lr < 0)
+               goto spurious;
+
+       if (grp != !!(lr_val & ICH_LR_GROUP))
+               goto spurious;
+
+       pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
+       lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;
+       if (pmr <= lr_prio)
+               goto spurious;
+
+       if (__vgic_v3_get_highest_active_priority() <= __vgic_v3_pri_to_pre(lr_prio, vmcr, grp))
+               goto spurious;
+
+       lr_val &= ~ICH_LR_STATE;
+       /* No active state for LPIs */
+       if ((lr_val & ICH_LR_VIRTUAL_ID_MASK) <= VGIC_MAX_SPI)
+               lr_val |= ICH_LR_ACTIVE_BIT;
+       __gic_v3_set_lr(lr_val, lr);
+       __vgic_v3_set_active_priority(lr_prio, vmcr, grp);
+       vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK);
+       return;
+
+spurious:
+       vcpu_set_reg(vcpu, rt, ICC_IAR1_EL1_SPURIOUS);
+}
+
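+/*
+ * Deactivate the interrupt held in a list register. For HW-backed
+ * interrupts, also forward the deactivation of the physical interrupt
+ * via gic_write_dir().
+ */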
+static void __hyp_text __vgic_v3_clear_active_lr(int lr, u64 lr_val)
+{
+       lr_val &= ~ICH_LR_ACTIVE_BIT;
+       if (lr_val & ICH_LR_HW) {
+               u32 pid;
+
+               pid = (lr_val & ICH_LR_PHYS_ID_MASK) >> ICH_LR_PHYS_ID_SHIFT;
+               gic_write_dir(pid);
+       }
+
+       __gic_v3_set_lr(lr_val, lr);
+}
+
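+/*
+ * No LR to deactivate: account for the spare EOI/DIR by incrementing
+ * the EOIcount field of ICH_HCR_EL2.
+ */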
+static void __hyp_text __vgic_v3_bump_eoicount(void)
+{
+       u32 hcr;
+
+       hcr = read_gicreg(ICH_HCR_EL2);
+       hcr += 1 << ICH_HCR_EOIcount_SHIFT;
+       write_gicreg(hcr, ICH_HCR_EL2);
+}
+
+static void __hyp_text __vgic_v3_write_dir(struct kvm_vcpu *vcpu,
+                                          u32 vmcr, int rt)
+{
+       u32 vid = vcpu_get_reg(vcpu, rt);
+       u64 lr_val;
+       int lr;
+
+       /* EOImode == 0, nothing to be done here */
+       if (!(vmcr & ICH_VMCR_EOIM_MASK))
+               return;
+
+       /* No deactivate to be performed on an LPI */
+       if (vid >= VGIC_MIN_LPI)
+               return;
+
+       lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val);
+       if (lr == -1) {
+               __vgic_v3_bump_eoicount();
+               return;
+       }
+
+       __vgic_v3_clear_active_lr(lr, lr_val);
+}
+
+static void __hyp_text __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+       u32 vid = vcpu_get_reg(vcpu, rt);
+       u64 lr_val;
+       u8 lr_prio, act_prio;
+       int lr, grp;
+
+       grp = __vgic_v3_get_group(vcpu);
+
+       /* Drop priority in any case */
+       act_prio = __vgic_v3_clear_highest_active_priority();
+
+       /* If EOIing an LPI, no deactivate to be performed */
+       if (vid >= VGIC_MIN_LPI)
+               return;
+
+       /* EOImode == 1, nothing to be done here */
+       if (vmcr & ICH_VMCR_EOIM_MASK)
+               return;
+
+       lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val);
+       if (lr == -1) {
+               __vgic_v3_bump_eoicount();
+               return;
+       }
+
+       lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;
+
+       /* If priorities or group do not match, the guest has fscked-up. */
+       if (grp != !!(lr_val & ICH_LR_GROUP) ||
+           __vgic_v3_pri_to_pre(lr_prio, vmcr, grp) != act_prio)
+               return;
+
+       /* Let's now perform the deactivation */
+       __vgic_v3_clear_active_lr(lr, lr_val);
+}
+
+static void __hyp_text __vgic_v3_read_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+       vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG0_MASK));
+}
+
+static void __hyp_text __vgic_v3_read_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+       vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG1_MASK));
+}
+
+static void __hyp_text __vgic_v3_write_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+       u64 val = vcpu_get_reg(vcpu, rt);
+
+       if (val & 1)
+               vmcr |= ICH_VMCR_ENG0_MASK;
+       else
+               vmcr &= ~ICH_VMCR_ENG0_MASK;
+
+       __vgic_v3_write_vmcr(vmcr);
+}
+
+static void __hyp_text __vgic_v3_write_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+       u64 val = vcpu_get_reg(vcpu, rt);
+
+       if (val & 1)
+               vmcr |= ICH_VMCR_ENG1_MASK;
+       else
+               vmcr &= ~ICH_VMCR_ENG1_MASK;
+
+       __vgic_v3_write_vmcr(vmcr);
+}
+
+static void __hyp_text __vgic_v3_read_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+       vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr0(vmcr));
+}
+
+static void __hyp_text __vgic_v3_read_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+       vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr1(vmcr));
+}
+
+static void __hyp_text __vgic_v3_write_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+       u64 val = vcpu_get_reg(vcpu, rt);
+       u8 bpr_min = __vgic_v3_bpr_min() - 1;
+
+       /* Enforce BPR limiting */
+       if (val < bpr_min)
+               val = bpr_min;
+
+       val <<= ICH_VMCR_BPR0_SHIFT;
+       val &= ICH_VMCR_BPR0_MASK;
+       vmcr &= ~ICH_VMCR_BPR0_MASK;
+       vmcr |= val;
+
+       __vgic_v3_write_vmcr(vmcr);
+}
+
+static void __hyp_text __vgic_v3_write_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+       u64 val = vcpu_get_reg(vcpu, rt);
+       u8 bpr_min = __vgic_v3_bpr_min();
+
+       if (vmcr & ICH_VMCR_CBPR_MASK)
+               return;
+
+       /* Enforce BPR limiting */
+       if (val < bpr_min)
+               val = bpr_min;
+
+       val <<= ICH_VMCR_BPR1_SHIFT;
+       val &= ICH_VMCR_BPR1_MASK;
+       vmcr &= ~ICH_VMCR_BPR1_MASK;
+       vmcr |= val;
+
+       __vgic_v3_write_vmcr(vmcr);
+}
+
+static void __hyp_text __vgic_v3_read_apxrn(struct kvm_vcpu *vcpu, int rt, int n)
+{
+       u32 val;
+
+       if (!__vgic_v3_get_group(vcpu))
+               val = __vgic_v3_read_ap0rn(n);
+       else
+               val = __vgic_v3_read_ap1rn(n);
+
+       vcpu_set_reg(vcpu, rt, val);
+}
+
+static void __hyp_text __vgic_v3_write_apxrn(struct kvm_vcpu *vcpu, int rt, int n)
+{
+       u32 val = vcpu_get_reg(vcpu, rt);
+
+       if (!__vgic_v3_get_group(vcpu))
+               __vgic_v3_write_ap0rn(val, n);
+       else
+               __vgic_v3_write_ap1rn(val, n);
+}
+
+static void __hyp_text __vgic_v3_read_apxr0(struct kvm_vcpu *vcpu,
+                                           u32 vmcr, int rt)
+{
+       __vgic_v3_read_apxrn(vcpu, rt, 0);
+}
+
+static void __hyp_text __vgic_v3_read_apxr1(struct kvm_vcpu *vcpu,
+                                           u32 vmcr, int rt)
+{
+       __vgic_v3_read_apxrn(vcpu, rt, 1);
+}
+
+static void __hyp_text __vgic_v3_read_apxr2(struct kvm_vcpu *vcpu,
+                                           u32 vmcr, int rt)
+{
+       __vgic_v3_read_apxrn(vcpu, rt, 2);
+}
+
+static void __hyp_text __vgic_v3_read_apxr3(struct kvm_vcpu *vcpu,
+                                           u32 vmcr, int rt)
+{
+       __vgic_v3_read_apxrn(vcpu, rt, 3);
+}
+
+static void __hyp_text __vgic_v3_write_apxr0(struct kvm_vcpu *vcpu,
+                                            u32 vmcr, int rt)
+{
+       __vgic_v3_write_apxrn(vcpu, rt, 0);
+}
+
+static void __hyp_text __vgic_v3_write_apxr1(struct kvm_vcpu *vcpu,
+                                            u32 vmcr, int rt)
+{
+       __vgic_v3_write_apxrn(vcpu, rt, 1);
+}
+
+static void __hyp_text __vgic_v3_write_apxr2(struct kvm_vcpu *vcpu,
+                                            u32 vmcr, int rt)
+{
+       __vgic_v3_write_apxrn(vcpu, rt, 2);
+}
+
+static void __hyp_text __vgic_v3_write_apxr3(struct kvm_vcpu *vcpu,
+                                            u32 vmcr, int rt)
+{
+       __vgic_v3_write_apxrn(vcpu, rt, 3);
+}
+
+static void __hyp_text __vgic_v3_read_hppir(struct kvm_vcpu *vcpu,
+                                           u32 vmcr, int rt)
+{
+       u64 lr_val;
+       int lr, lr_grp, grp;
+
+       grp = __vgic_v3_get_group(vcpu);
+
+       lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val);
+       if (lr == -1)
+               goto spurious;
+
+       lr_grp = !!(lr_val & ICH_LR_GROUP);
+       if (lr_grp != grp)
+               lr_val = ICC_IAR1_EL1_SPURIOUS;
+
+spurious:
+       vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK);
+}
+
+static void __hyp_text __vgic_v3_read_pmr(struct kvm_vcpu *vcpu,
+                                         u32 vmcr, int rt)
+{
+       vmcr &= ICH_VMCR_PMR_MASK;
+       vmcr >>= ICH_VMCR_PMR_SHIFT;
+       vcpu_set_reg(vcpu, rt, vmcr);
+}
+
+static void __hyp_text __vgic_v3_write_pmr(struct kvm_vcpu *vcpu,
+                                          u32 vmcr, int rt)
+{
+       u32 val = vcpu_get_reg(vcpu, rt);
+
+       val <<= ICH_VMCR_PMR_SHIFT;
+       val &= ICH_VMCR_PMR_MASK;
+       vmcr &= ~ICH_VMCR_PMR_MASK;
+       vmcr |= val;
+
+       write_gicreg(vmcr, ICH_VMCR_EL2);
+}
+
+static void __hyp_text __vgic_v3_read_rpr(struct kvm_vcpu *vcpu,
+                                         u32 vmcr, int rt)
+{
+       u32 val = __vgic_v3_get_highest_active_priority();
+       vcpu_set_reg(vcpu, rt, val);
+}
+
+static void __hyp_text __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu,
+                                          u32 vmcr, int rt)
+{
+       u32 vtr, val;
+
+       vtr = read_gicreg(ICH_VTR_EL2);
+       /* PRIbits */
+       val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT;
+       /* IDbits */
+       val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT;
+       /* SEIS */
+       val |= ((vtr >> 22) & 1) << ICC_CTLR_EL1_SEIS_SHIFT;
+       /* A3V */
+       val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT;
+       /* EOImode */
+       val |= ((vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT) << ICC_CTLR_EL1_EOImode_SHIFT;
+       /* CBPR */
+       val |= (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT;
+
+       vcpu_set_reg(vcpu, rt, val);
+}
+
+static void __hyp_text __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu,
+                                           u32 vmcr, int rt)
+{
+       u32 val = vcpu_get_reg(vcpu, rt);
+
+       if (val & ICC_CTLR_EL1_CBPR_MASK)
+               vmcr |= ICH_VMCR_CBPR_MASK;
+       else
+               vmcr &= ~ICH_VMCR_CBPR_MASK;
+
+       if (val & ICC_CTLR_EL1_EOImode_MASK)
+               vmcr |= ICH_VMCR_EOIM_MASK;
+       else
+               vmcr &= ~ICH_VMCR_EOIM_MASK;
+
+       write_gicreg(vmcr, ICH_VMCR_EL2);
+}
+
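+/*
+ * Handle a trapped guest access to the GICv3 CPU interface: decode the
+ * system register (or cp15) access from the ESR, dispatch it to the
+ * matching emulation routine above and skip the trapped instruction.
+ * Returns 1 when the access has been emulated, 0 to fall back to the
+ * regular sysreg handling.
+ */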
+int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
+{
+       int rt;
+       u32 esr;
+       u32 vmcr;
+       void (*fn)(struct kvm_vcpu *, u32, int);
+       bool is_read;
+       u32 sysreg;
+
+       esr = kvm_vcpu_get_hsr(vcpu);
+       if (vcpu_mode_is_32bit(vcpu)) {
+               if (!kvm_condition_valid(vcpu)) {
+                       __kvm_skip_instr(vcpu);
+                       return 1;
+               }
+
+               sysreg = esr_cp15_to_sysreg(esr);
+       } else {
+               sysreg = esr_sys64_to_sysreg(esr);
+       }
+
+       is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ;
+
+       switch (sysreg) {
+       case SYS_ICC_IAR0_EL1:
+       case SYS_ICC_IAR1_EL1:
+               if (unlikely(!is_read))
+                       return 0;
+               fn = __vgic_v3_read_iar;
+               break;
+       case SYS_ICC_EOIR0_EL1:
+       case SYS_ICC_EOIR1_EL1:
+               if (unlikely(is_read))
+                       return 0;
+               fn = __vgic_v3_write_eoir;
+               break;
+       case SYS_ICC_IGRPEN1_EL1:
+               if (is_read)
+                       fn = __vgic_v3_read_igrpen1;
+               else
+                       fn = __vgic_v3_write_igrpen1;
+               break;
+       case SYS_ICC_BPR1_EL1:
+               if (is_read)
+                       fn = __vgic_v3_read_bpr1;
+               else
+                       fn = __vgic_v3_write_bpr1;
+               break;
+       case SYS_ICC_AP0Rn_EL1(0):
+       case SYS_ICC_AP1Rn_EL1(0):
+               if (is_read)
+                       fn = __vgic_v3_read_apxr0;
+               else
+                       fn = __vgic_v3_write_apxr0;
+               break;
+       case SYS_ICC_AP0Rn_EL1(1):
+       case SYS_ICC_AP1Rn_EL1(1):
+               if (is_read)
+                       fn = __vgic_v3_read_apxr1;
+               else
+                       fn = __vgic_v3_write_apxr1;
+               break;
+       case SYS_ICC_AP0Rn_EL1(2):
+       case SYS_ICC_AP1Rn_EL1(2):
+               if (is_read)
+                       fn = __vgic_v3_read_apxr2;
+               else
+                       fn = __vgic_v3_write_apxr2;
+               break;
+       case SYS_ICC_AP0Rn_EL1(3):
+       case SYS_ICC_AP1Rn_EL1(3):
+               if (is_read)
+                       fn = __vgic_v3_read_apxr3;
+               else
+                       fn = __vgic_v3_write_apxr3;
+               break;
+       case SYS_ICC_HPPIR0_EL1:
+       case SYS_ICC_HPPIR1_EL1:
+               if (unlikely(!is_read))
+                       return 0;
+               fn = __vgic_v3_read_hppir;
+               break;
+       case SYS_ICC_IGRPEN0_EL1:
+               if (is_read)
+                       fn = __vgic_v3_read_igrpen0;
+               else
+                       fn = __vgic_v3_write_igrpen0;
+               break;
+       case SYS_ICC_BPR0_EL1:
+               if (is_read)
+                       fn = __vgic_v3_read_bpr0;
+               else
+                       fn = __vgic_v3_write_bpr0;
+               break;
+       case SYS_ICC_DIR_EL1:
+               if (unlikely(is_read))
+                       return 0;
+               fn = __vgic_v3_write_dir;
+               break;
+       case SYS_ICC_RPR_EL1:
+               if (unlikely(!is_read))
+                       return 0;
+               fn = __vgic_v3_read_rpr;
+               break;
+       case SYS_ICC_CTLR_EL1:
+               if (is_read)
+                       fn = __vgic_v3_read_ctlr;
+               else
+                       fn = __vgic_v3_write_ctlr;
+               break;
+       case SYS_ICC_PMR_EL1:
+               if (is_read)
+                       fn = __vgic_v3_read_pmr;
+               else
+                       fn = __vgic_v3_write_pmr;
+               break;
+       default:
+               return 0;
+       }
+
+       vmcr = __vgic_v3_read_vmcr();
+       rt = kvm_vcpu_sys_get_rt(vcpu);
+       fn(vcpu, vmcr, rt);
+
+       __kvm_skip_instr(vcpu);
+
+       return 1;
+}
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
new file mode 100644 (file)
index 0000000..550dfa3
--- /dev/null
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2019 Arm Ltd.
+
+#include <linux/arm-smccc.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_emulate.h>
+
+#include <kvm/arm_hypercalls.h>
+#include <kvm/arm_psci.h>
+
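+/*
+ * Entry point for guest HVC/SMCCC calls: answer the SMCCC version and
+ * feature queries and the paravirtualised time calls directly, and hand
+ * everything else (PSCI included) to kvm_psci_call(). For the calls
+ * handled here, the result is passed back via smccc_set_retval().
+ */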
+int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
+{
+       u32 func_id = smccc_get_function(vcpu);
+       long val = SMCCC_RET_NOT_SUPPORTED;
+       u32 feature;
+       gpa_t gpa;
+
+       switch (func_id) {
+       case ARM_SMCCC_VERSION_FUNC_ID:
+               val = ARM_SMCCC_VERSION_1_1;
+               break;
+       case ARM_SMCCC_ARCH_FEATURES_FUNC_ID:
+               feature = smccc_get_arg1(vcpu);
+               switch (feature) {
+               case ARM_SMCCC_ARCH_WORKAROUND_1:
+                       switch (kvm_arm_harden_branch_predictor()) {
+                       case KVM_BP_HARDEN_UNKNOWN:
+                               break;
+                       case KVM_BP_HARDEN_WA_NEEDED:
+                               val = SMCCC_RET_SUCCESS;
+                               break;
+                       case KVM_BP_HARDEN_NOT_REQUIRED:
+                               val = SMCCC_RET_NOT_REQUIRED;
+                               break;
+                       }
+                       break;
+               case ARM_SMCCC_ARCH_WORKAROUND_2:
+                       switch (kvm_arm_have_ssbd()) {
+                       case KVM_SSBD_FORCE_DISABLE:
+                       case KVM_SSBD_UNKNOWN:
+                               break;
+                       case KVM_SSBD_KERNEL:
+                               val = SMCCC_RET_SUCCESS;
+                               break;
+                       case KVM_SSBD_FORCE_ENABLE:
+                       case KVM_SSBD_MITIGATED:
+                               val = SMCCC_RET_NOT_REQUIRED;
+                               break;
+                       }
+                       break;
+               case ARM_SMCCC_HV_PV_TIME_FEATURES:
+                       val = SMCCC_RET_SUCCESS;
+                       break;
+               }
+               break;
+       case ARM_SMCCC_HV_PV_TIME_FEATURES:
+               val = kvm_hypercall_pv_features(vcpu);
+               break;
+       case ARM_SMCCC_HV_PV_TIME_ST:
+               gpa = kvm_init_stolen_time(vcpu);
+               if (gpa != GPA_INVALID)
+                       val = gpa;
+               break;
+       default:
+               return kvm_psci_call(vcpu);
+       }
+
+       smccc_set_retval(vcpu, val, 0, 0, 0);
+       return 1;
+}
diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c
new file mode 100644 (file)
index 0000000..aedfcff
--- /dev/null
@@ -0,0 +1,200 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall <c.dall@virtualopensystems.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_emulate.h>
+#include <trace/events/kvm.h>
+
+#include "trace.h"
+
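+/*
+ * Store @data into @buf with the width given by @len (1, 2, 4 or 8
+ * bytes), so it can be handed to the MMIO bus or to userspace.
+ */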
+void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data)
+{
+       void *datap = NULL;
+       union {
+               u8      byte;
+               u16     hword;
+               u32     word;
+               u64     dword;
+       } tmp;
+
+       switch (len) {
+       case 1:
+               tmp.byte        = data;
+               datap           = &tmp.byte;
+               break;
+       case 2:
+               tmp.hword       = data;
+               datap           = &tmp.hword;
+               break;
+       case 4:
+               tmp.word        = data;
+               datap           = &tmp.word;
+               break;
+       case 8:
+               tmp.dword       = data;
+               datap           = &tmp.dword;
+               break;
+       }
+
+       memcpy(buf, datap, len);
+}
+
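+/*
+ * Read an MMIO value of @len bytes from @buf and return it
+ * zero-extended to an unsigned long.
+ */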
+unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len)
+{
+       unsigned long data = 0;
+       union {
+               u16     hword;
+               u32     word;
+               u64     dword;
+       } tmp;
+
+       switch (len) {
+       case 1:
+               data = *(u8 *)buf;
+               break;
+       case 2:
+               memcpy(&tmp.hword, buf, len);
+               data = tmp.hword;
+               break;
+       case 4:
+               memcpy(&tmp.word, buf, len);
+               data = tmp.word;
+               break;
+       case 8:
+               memcpy(&tmp.dword, buf, len);
+               data = tmp.dword;
+               break;
+       }
+
+       return data;
+}
+
+/**
+ * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation
+ *                          or in-kernel IO emulation
+ *
+ * @vcpu: The VCPU pointer
+ * @run:  The VCPU run struct containing the mmio data
+ */
+int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+       unsigned long data;
+       unsigned int len;
+       int mask;
+
+       /* Detect an already handled MMIO return */
+       if (unlikely(!vcpu->mmio_needed))
+               return 0;
+
+       vcpu->mmio_needed = 0;
+
+       if (!kvm_vcpu_dabt_iswrite(vcpu)) {
+               len = kvm_vcpu_dabt_get_as(vcpu);
+               data = kvm_mmio_read_buf(run->mmio.data, len);
+
+               if (kvm_vcpu_dabt_issext(vcpu) &&
+                   len < sizeof(unsigned long)) {
+                       mask = 1U << ((len * 8) - 1);
+                       data = (data ^ mask) - mask;
+               }
+
+               if (!kvm_vcpu_dabt_issf(vcpu))
+                       data = data & 0xffffffff;
+
+               trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr,
+                              &data);
+               data = vcpu_data_host_to_guest(vcpu, data, len);
+               vcpu_set_reg(vcpu, kvm_vcpu_dabt_get_rd(vcpu), data);
+       }
+
+       /*
+        * The MMIO instruction is emulated and should not be re-executed
+        * in the guest.
+        */
+       kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+
+       return 0;
+}
+
+int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
+                phys_addr_t fault_ipa)
+{
+       unsigned long data;
+       unsigned long rt;
+       int ret;
+       bool is_write;
+       int len;
+       u8 data_buf[8];
+
+       /*
+        * No valid syndrome? Ask userspace for help if it has
+        * volunteered to do so, and bail out otherwise.
+        */
+       if (!kvm_vcpu_dabt_isvalid(vcpu)) {
+               if (vcpu->kvm->arch.return_nisv_io_abort_to_user) {
+                       run->exit_reason = KVM_EXIT_ARM_NISV;
+                       run->arm_nisv.esr_iss = kvm_vcpu_dabt_iss_nisv_sanitized(vcpu);
+                       run->arm_nisv.fault_ipa = fault_ipa;
+                       return 0;
+               }
+
+               kvm_pr_unimpl("Data abort outside memslots with no valid syndrome info\n");
+               return -ENOSYS;
+       }
+
+       /* Page table accesses IO mem: tell guest to fix its TTBR */
+       if (kvm_vcpu_dabt_iss1tw(vcpu)) {
+               kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
+               return 1;
+       }
+
+       /*
+        * Prepare MMIO operation. First decode the syndrome data we get
+        * from the CPU. Then check whether some in-kernel emulation is
+        * responsible; otherwise let user space do its magic.
+        */
+       is_write = kvm_vcpu_dabt_iswrite(vcpu);
+       len = kvm_vcpu_dabt_get_as(vcpu);
+       rt = kvm_vcpu_dabt_get_rd(vcpu);
+
+       if (is_write) {
+               data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt),
+                                              len);
+
+               trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, &data);
+               kvm_mmio_write_buf(data_buf, len, data);
+
+               ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len,
+                                      data_buf);
+       } else {
+               trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len,
+                              fault_ipa, NULL);
+
+               ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len,
+                                     data_buf);
+       }
+
+       /* Now prepare kvm_run for the potential return to userland. */
+       run->mmio.is_write      = is_write;
+       run->mmio.phys_addr     = fault_ipa;
+       run->mmio.len           = len;
+       vcpu->mmio_needed       = 1;
+
+       if (!ret) {
+               /* We handled the access successfully in the kernel. */
+               if (!is_write)
+                       memcpy(run->mmio.data, data_buf, len);
+               vcpu->stat.mmio_exit_kernel++;
+               kvm_handle_mmio_return(vcpu, run);
+               return 1;
+       }
+
+       if (is_write)
+               memcpy(run->mmio.data, data_buf, len);
+       vcpu->stat.mmio_exit_user++;
+       run->exit_reason        = KVM_EXIT_MMIO;
+       return 0;
+}
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
new file mode 100644 (file)
index 0000000..e3b9ee2
--- /dev/null
@@ -0,0 +1,2447 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall <c.dall@virtualopensystems.com>
+ */
+
+#include <linux/mman.h>
+#include <linux/kvm_host.h>
+#include <linux/io.h>
+#include <linux/hugetlb.h>
+#include <linux/sched/signal.h>
+#include <trace/events/kvm.h>
+#include <asm/pgalloc.h>
+#include <asm/cacheflush.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_ras.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_emulate.h>
+#include <asm/virt.h>
+
+#include "trace.h"
+
+static pgd_t *boot_hyp_pgd;
+static pgd_t *hyp_pgd;
+static pgd_t *merged_hyp_pgd;
+static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
+
+static unsigned long hyp_idmap_start;
+static unsigned long hyp_idmap_end;
+static phys_addr_t hyp_idmap_vector;
+
+static unsigned long io_map_base;
+
+#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
+
+#define KVM_S2PTE_FLAG_IS_IOMAP                (1UL << 0)
+#define KVM_S2_FLAG_LOGGING_ACTIVE     (1UL << 1)
+
+static bool is_iomap(unsigned long flags)
+{
+       return flags & KVM_S2PTE_FLAG_IS_IOMAP;
+}
+
+static bool memslot_is_logging(struct kvm_memory_slot *memslot)
+{
+       return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
+}
+
+/**
+ * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
+ * @kvm:       pointer to kvm structure.
+ *
+ * Interface to HYP function to flush all VM TLB entries
+ */
+void kvm_flush_remote_tlbs(struct kvm *kvm)
+{
+       kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
+}
+
+static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
+{
+       kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
+}
+
+/*
+ * D-Cache management functions. They take the page table entries by
+ * value, as they are flushing the cache using the kernel mapping (or
+ * kmap on 32bit).
+ */
+static void kvm_flush_dcache_pte(pte_t pte)
+{
+       __kvm_flush_dcache_pte(pte);
+}
+
+static void kvm_flush_dcache_pmd(pmd_t pmd)
+{
+       __kvm_flush_dcache_pmd(pmd);
+}
+
+static void kvm_flush_dcache_pud(pud_t pud)
+{
+       __kvm_flush_dcache_pud(pud);
+}
+
+static bool kvm_is_device_pfn(unsigned long pfn)
+{
+       return !pfn_valid(pfn);
+}
+
+/**
+ * stage2_dissolve_pmd() - clear and flush huge PMD entry
+ * @kvm:       pointer to kvm structure.
+ * @addr:      IPA
+ * @pmd:       pmd pointer for IPA
+ *
+ * Clears a PMD entry and flushes the stage-1 and stage-2 TLBs for @addr.
+ */
+static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
+{
+       if (!pmd_thp_or_huge(*pmd))
+               return;
+
+       pmd_clear(pmd);
+       kvm_tlb_flush_vmid_ipa(kvm, addr);
+       put_page(virt_to_page(pmd));
+}
+
+/**
+ * stage2_dissolve_pud() - clear and flush huge PUD entry
+ * @kvm:       pointer to kvm structure.
+ * @addr:      IPA
+ * @pudp:      pud pointer for IPA
+ *
+ * Clears a PUD entry and flushes the stage-1 and stage-2 TLBs for @addr.
+ */
+static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp)
+{
+       if (!stage2_pud_huge(kvm, *pudp))
+               return;
+
+       stage2_pud_clear(kvm, pudp);
+       kvm_tlb_flush_vmid_ipa(kvm, addr);
+       put_page(virt_to_page(pudp));
+}
+
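+/*
+ * Pre-fill the page-table object cache so that the stage-2 fault path
+ * can later take table pages from it without having to allocate
+ * (e.g. while holding the mmu_lock).
+ */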
+static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
+                                 int min, int max)
+{
+       void *page;
+
+       BUG_ON(max > KVM_NR_MEM_OBJS);
+       if (cache->nobjs >= min)
+               return 0;
+       while (cache->nobjs < max) {
+               page = (void *)__get_free_page(GFP_PGTABLE_USER);
+               if (!page)
+                       return -ENOMEM;
+               cache->objects[cache->nobjs++] = page;
+       }
+       return 0;
+}
+
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+{
+       while (mc->nobjs)
+               free_page((unsigned long)mc->objects[--mc->nobjs]);
+}
+
+static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
+{
+       void *p;
+
+       BUG_ON(!mc || !mc->nobjs);
+       p = mc->objects[--mc->nobjs];
+       return p;
+}
+
+static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
+{
+       pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, pgd, 0UL);
+       stage2_pgd_clear(kvm, pgd);
+       kvm_tlb_flush_vmid_ipa(kvm, addr);
+       stage2_pud_free(kvm, pud_table);
+       put_page(virt_to_page(pgd));
+}
+
+static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
+{
+       pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
+       VM_BUG_ON(stage2_pud_huge(kvm, *pud));
+       stage2_pud_clear(kvm, pud);
+       kvm_tlb_flush_vmid_ipa(kvm, addr);
+       stage2_pmd_free(kvm, pmd_table);
+       put_page(virt_to_page(pud));
+}
+
+static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
+{
+       pte_t *pte_table = pte_offset_kernel(pmd, 0);
+       VM_BUG_ON(pmd_thp_or_huge(*pmd));
+       pmd_clear(pmd);
+       kvm_tlb_flush_vmid_ipa(kvm, addr);
+       free_page((unsigned long)pte_table);
+       put_page(virt_to_page(pmd));
+}
+
+static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
+{
+       WRITE_ONCE(*ptep, new_pte);
+       dsb(ishst);
+}
+
+static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
+{
+       WRITE_ONCE(*pmdp, new_pmd);
+       dsb(ishst);
+}
+
+static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
+{
+       kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
+}
+
+static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
+{
+       WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
+       dsb(ishst);
+}
+
+static inline void kvm_pgd_populate(pgd_t *pgdp, pud_t *pudp)
+{
+       WRITE_ONCE(*pgdp, kvm_mk_pgd(pudp));
+       dsb(ishst);
+}
+
+/*
+ * Unmapping vs dcache management:
+ *
+ * If a guest maps certain memory pages as uncached, all writes will
+ * bypass the data cache and go directly to RAM.  However, the CPUs
+ * can still speculate reads (not writes) and fill cache lines with
+ * data.
+ *
+ * Those cache lines will be *clean* cache lines though, so a
+ * clean+invalidate operation is equivalent to an invalidate
+ * operation, because no cache lines are marked dirty.
+ *
+ * Those clean cache lines could be filled prior to an uncached write
+ * by the guest, and the cache coherent IO subsystem would therefore
+ * end up writing old data to disk.
+ *
+ * This is why right after unmapping a page/section and invalidating
+ * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
+ * the IO subsystem will never hit in the cache.
+ *
+ * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
+ * we then fully enforce cacheability of RAM, no matter what the guest
+ * does.
+ */
+static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
+                      phys_addr_t addr, phys_addr_t end)
+{
+       phys_addr_t start_addr = addr;
+       pte_t *pte, *start_pte;
+
+       start_pte = pte = pte_offset_kernel(pmd, addr);
+       do {
+               if (!pte_none(*pte)) {
+                       pte_t old_pte = *pte;
+
+                       kvm_set_pte(pte, __pte(0));
+                       kvm_tlb_flush_vmid_ipa(kvm, addr);
+
+                       /* No need to invalidate the cache for device mappings */
+                       if (!kvm_is_device_pfn(pte_pfn(old_pte)))
+                               kvm_flush_dcache_pte(old_pte);
+
+                       put_page(virt_to_page(pte));
+               }
+       } while (pte++, addr += PAGE_SIZE, addr != end);
+
+       if (stage2_pte_table_empty(kvm, start_pte))
+               clear_stage2_pmd_entry(kvm, pmd, start_addr);
+}
+
+static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud,
+                      phys_addr_t addr, phys_addr_t end)
+{
+       phys_addr_t next, start_addr = addr;
+       pmd_t *pmd, *start_pmd;
+
+       start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
+       do {
+               next = stage2_pmd_addr_end(kvm, addr, end);
+               if (!pmd_none(*pmd)) {
+                       if (pmd_thp_or_huge(*pmd)) {
+                               pmd_t old_pmd = *pmd;
+
+                               pmd_clear(pmd);
+                               kvm_tlb_flush_vmid_ipa(kvm, addr);
+
+                               kvm_flush_dcache_pmd(old_pmd);
+
+                               put_page(virt_to_page(pmd));
+                       } else {
+                               unmap_stage2_ptes(kvm, pmd, addr, next);
+                       }
+               }
+       } while (pmd++, addr = next, addr != end);
+
+       if (stage2_pmd_table_empty(kvm, start_pmd))
+               clear_stage2_pud_entry(kvm, pud, start_addr);
+}
+
+static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd,
+                      phys_addr_t addr, phys_addr_t end)
+{
+       phys_addr_t next, start_addr = addr;
+       pud_t *pud, *start_pud;
+
+       start_pud = pud = stage2_pud_offset(kvm, pgd, addr);
+       do {
+               next = stage2_pud_addr_end(kvm, addr, end);
+               if (!stage2_pud_none(kvm, *pud)) {
+                       if (stage2_pud_huge(kvm, *pud)) {
+                               pud_t old_pud = *pud;
+
+                               stage2_pud_clear(kvm, pud);
+                               kvm_tlb_flush_vmid_ipa(kvm, addr);
+                               kvm_flush_dcache_pud(old_pud);
+                               put_page(virt_to_page(pud));
+                       } else {
+                               unmap_stage2_pmds(kvm, pud, addr, next);
+                       }
+               }
+       } while (pud++, addr = next, addr != end);
+
+       if (stage2_pud_table_empty(kvm, start_pud))
+               clear_stage2_pgd_entry(kvm, pgd, start_addr);
+}
+
+/**
+ * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
+ * @kvm:   The VM pointer
+ * @start: The intermediate physical base address of the range to unmap
+ * @size:  The size of the area to unmap
+ *
+ * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
+ * be called while holding mmu_lock (unless for freeing the stage2 pgd before
+ * destroying the VM), otherwise another faulting VCPU may come in and mess
+ * with things behind our backs.
+ */
+static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
+{
+       pgd_t *pgd;
+       phys_addr_t addr = start, end = start + size;
+       phys_addr_t next;
+
+       assert_spin_locked(&kvm->mmu_lock);
+       WARN_ON(size & ~PAGE_MASK);
+
+       pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
+       do {
+               /*
+                * Make sure the page table is still active, as another thread
+                * could have freed the page table while we released
+                * the lock.
+                */
+               if (!READ_ONCE(kvm->arch.pgd))
+                       break;
+               next = stage2_pgd_addr_end(kvm, addr, end);
+               if (!stage2_pgd_none(kvm, *pgd))
+                       unmap_stage2_puds(kvm, pgd, addr, next);
+               /*
+                * If the range is too large, release the kvm->mmu_lock
+                * to prevent starvation and lockup detector warnings.
+                */
+               if (next != end)
+                       cond_resched_lock(&kvm->mmu_lock);
+       } while (pgd++, addr = next, addr != end);
+}
+
+static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
+                             phys_addr_t addr, phys_addr_t end)
+{
+       pte_t *pte;
+
+       pte = pte_offset_kernel(pmd, addr);
+       do {
+               if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
+                       kvm_flush_dcache_pte(*pte);
+       } while (pte++, addr += PAGE_SIZE, addr != end);
+}
+
+static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
+                             phys_addr_t addr, phys_addr_t end)
+{
+       pmd_t *pmd;
+       phys_addr_t next;
+
+       pmd = stage2_pmd_offset(kvm, pud, addr);
+       do {
+               next = stage2_pmd_addr_end(kvm, addr, end);
+               if (!pmd_none(*pmd)) {
+                       if (pmd_thp_or_huge(*pmd))
+                               kvm_flush_dcache_pmd(*pmd);
+                       else
+                               stage2_flush_ptes(kvm, pmd, addr, next);
+               }
+       } while (pmd++, addr = next, addr != end);
+}
+
+static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
+                             phys_addr_t addr, phys_addr_t end)
+{
+       pud_t *pud;
+       phys_addr_t next;
+
+       pud = stage2_pud_offset(kvm, pgd, addr);
+       do {
+               next = stage2_pud_addr_end(kvm, addr, end);
+               if (!stage2_pud_none(kvm, *pud)) {
+                       if (stage2_pud_huge(kvm, *pud))
+                               kvm_flush_dcache_pud(*pud);
+                       else
+                               stage2_flush_pmds(kvm, pud, addr, next);
+               }
+       } while (pud++, addr = next, addr != end);
+}
+
+static void stage2_flush_memslot(struct kvm *kvm,
+                                struct kvm_memory_slot *memslot)
+{
+       phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
+       phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
+       phys_addr_t next;
+       pgd_t *pgd;
+
+       pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
+       do {
+               next = stage2_pgd_addr_end(kvm, addr, end);
+               if (!stage2_pgd_none(kvm, *pgd))
+                       stage2_flush_puds(kvm, pgd, addr, next);
+       } while (pgd++, addr = next, addr != end);
+}
+
+/**
+ * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
+ * @kvm: The struct kvm pointer
+ *
+ * Go through the stage 2 page tables and invalidate any cache lines
+ * backing memory already mapped to the VM.
+ */
+static void stage2_flush_vm(struct kvm *kvm)
+{
+       struct kvm_memslots *slots;
+       struct kvm_memory_slot *memslot;
+       int idx;
+
+       idx = srcu_read_lock(&kvm->srcu);
+       spin_lock(&kvm->mmu_lock);
+
+       slots = kvm_memslots(kvm);
+       kvm_for_each_memslot(memslot, slots)
+               stage2_flush_memslot(kvm, memslot);
+
+       spin_unlock(&kvm->mmu_lock);
+       srcu_read_unlock(&kvm->srcu, idx);
+}
+
+static void clear_hyp_pgd_entry(pgd_t *pgd)
+{
+       pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL);
+       pgd_clear(pgd);
+       pud_free(NULL, pud_table);
+       put_page(virt_to_page(pgd));
+}
+
+static void clear_hyp_pud_entry(pud_t *pud)
+{
+       pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
+       VM_BUG_ON(pud_huge(*pud));
+       pud_clear(pud);
+       pmd_free(NULL, pmd_table);
+       put_page(virt_to_page(pud));
+}
+
+static void clear_hyp_pmd_entry(pmd_t *pmd)
+{
+       pte_t *pte_table = pte_offset_kernel(pmd, 0);
+       VM_BUG_ON(pmd_thp_or_huge(*pmd));
+       pmd_clear(pmd);
+       pte_free_kernel(NULL, pte_table);
+       put_page(virt_to_page(pmd));
+}
+
+static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
+{
+       pte_t *pte, *start_pte;
+
+       start_pte = pte = pte_offset_kernel(pmd, addr);
+       do {
+               if (!pte_none(*pte)) {
+                       kvm_set_pte(pte, __pte(0));
+                       put_page(virt_to_page(pte));
+               }
+       } while (pte++, addr += PAGE_SIZE, addr != end);
+
+       if (hyp_pte_table_empty(start_pte))
+               clear_hyp_pmd_entry(pmd);
+}
+
+static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
+{
+       phys_addr_t next;
+       pmd_t *pmd, *start_pmd;
+
+       start_pmd = pmd = pmd_offset(pud, addr);
+       do {
+               next = pmd_addr_end(addr, end);
+               /* Hyp doesn't use huge pmds */
+               if (!pmd_none(*pmd))
+                       unmap_hyp_ptes(pmd, addr, next);
+       } while (pmd++, addr = next, addr != end);
+
+       if (hyp_pmd_table_empty(start_pmd))
+               clear_hyp_pud_entry(pud);
+}
+
+static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
+{
+       phys_addr_t next;
+       pud_t *pud, *start_pud;
+
+       start_pud = pud = pud_offset(pgd, addr);
+       do {
+               next = pud_addr_end(addr, end);
+               /* Hyp doesn't use huge puds */
+               if (!pud_none(*pud))
+                       unmap_hyp_pmds(pud, addr, next);
+       } while (pud++, addr = next, addr != end);
+
+       if (hyp_pud_table_empty(start_pud))
+               clear_hyp_pgd_entry(pgd);
+}
+
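+/*
+ * Index into a hyp pgd with a variable number of entries: the idmap pgd
+ * may be extended beyond PTRS_PER_PGD.
+ */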
+static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd)
+{
+       return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1);
+}
+
+static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd,
+                             phys_addr_t start, u64 size)
+{
+       pgd_t *pgd;
+       phys_addr_t addr = start, end = start + size;
+       phys_addr_t next;
+
+       /*
+        * We don't unmap anything from HYP, except at the hyp tear down.
+        * Hence, we don't have to invalidate the TLBs here.
+        */
+       pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
+       do {
+               next = pgd_addr_end(addr, end);
+               if (!pgd_none(*pgd))
+                       unmap_hyp_puds(pgd, addr, next);
+       } while (pgd++, addr = next, addr != end);
+}
+
+static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
+{
+       __unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size);
+}
+
+static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size)
+{
+       __unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size);
+}
+
+/**
+ * free_hyp_pgds - free Hyp-mode page tables
+ *
+ * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
+ * therefore contains either mappings in the kernel memory area (above
+ * PAGE_OFFSET), or device mappings in the idmap range.
+ *
+ * boot_hyp_pgd should only map the idmap range, and is only used in
+ * the extended idmap case.
+ */
+void free_hyp_pgds(void)
+{
+       pgd_t *id_pgd;
+
+       mutex_lock(&kvm_hyp_pgd_mutex);
+
+       id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd;
+
+       if (id_pgd) {
+               /* In case we never called hyp_mmu_init() */
+               if (!io_map_base)
+                       io_map_base = hyp_idmap_start;
+               unmap_hyp_idmap_range(id_pgd, io_map_base,
+                                     hyp_idmap_start + PAGE_SIZE - io_map_base);
+       }
+
+       if (boot_hyp_pgd) {
+               free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
+               boot_hyp_pgd = NULL;
+       }
+
+       if (hyp_pgd) {
+               unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET),
+                               (uintptr_t)high_memory - PAGE_OFFSET);
+
+               free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
+               hyp_pgd = NULL;
+       }
+       if (merged_hyp_pgd) {
+               clear_page(merged_hyp_pgd);
+               free_page((unsigned long)merged_hyp_pgd);
+               merged_hyp_pgd = NULL;
+       }
+
+       mutex_unlock(&kvm_hyp_pgd_mutex);
+}
+
+static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
+                                   unsigned long end, unsigned long pfn,
+                                   pgprot_t prot)
+{
+       pte_t *pte;
+       unsigned long addr;
+
+       addr = start;
+       do {
+               pte = pte_offset_kernel(pmd, addr);
+               kvm_set_pte(pte, kvm_pfn_pte(pfn, prot));
+               get_page(virt_to_page(pte));
+               pfn++;
+       } while (addr += PAGE_SIZE, addr != end);
+}
+
+static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
+                                  unsigned long end, unsigned long pfn,
+                                  pgprot_t prot)
+{
+       pmd_t *pmd;
+       pte_t *pte;
+       unsigned long addr, next;
+
+       addr = start;
+       do {
+               pmd = pmd_offset(pud, addr);
+
+               BUG_ON(pmd_sect(*pmd));
+
+               if (pmd_none(*pmd)) {
+                       pte = pte_alloc_one_kernel(NULL);
+                       if (!pte) {
+                               kvm_err("Cannot allocate Hyp pte\n");
+                               return -ENOMEM;
+                       }
+                       kvm_pmd_populate(pmd, pte);
+                       get_page(virt_to_page(pmd));
+               }
+
+               next = pmd_addr_end(addr, end);
+
+               create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
+               pfn += (next - addr) >> PAGE_SHIFT;
+       } while (addr = next, addr != end);
+
+       return 0;
+}
+
+static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start,
+                                  unsigned long end, unsigned long pfn,
+                                  pgprot_t prot)
+{
+       pud_t *pud;
+       pmd_t *pmd;
+       unsigned long addr, next;
+       int ret;
+
+       addr = start;
+       do {
+               pud = pud_offset(pgd, addr);
+
+               if (pud_none_or_clear_bad(pud)) {
+                       pmd = pmd_alloc_one(NULL, addr);
+                       if (!pmd) {
+                               kvm_err("Cannot allocate Hyp pmd\n");
+                               return -ENOMEM;
+                       }
+                       kvm_pud_populate(pud, pmd);
+                       get_page(virt_to_page(pud));
+               }
+
+               next = pud_addr_end(addr, end);
+               ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
+               if (ret)
+                       return ret;
+               pfn += (next - addr) >> PAGE_SHIFT;
+       } while (addr = next, addr != end);
+
+       return 0;
+}
+
+static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd,
+                                unsigned long start, unsigned long end,
+                                unsigned long pfn, pgprot_t prot)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       unsigned long addr, next;
+       int err = 0;
+
+       mutex_lock(&kvm_hyp_pgd_mutex);
+       addr = start & PAGE_MASK;
+       end = PAGE_ALIGN(end);
+       do {
+               pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
+
+               if (pgd_none(*pgd)) {
+                       pud = pud_alloc_one(NULL, addr);
+                       if (!pud) {
+                               kvm_err("Cannot allocate Hyp pud\n");
+                               err = -ENOMEM;
+                               goto out;
+                       }
+                       kvm_pgd_populate(pgd, pud);
+                       get_page(virt_to_page(pgd));
+               }
+
+               next = pgd_addr_end(addr, end);
+               err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot);
+               if (err)
+                       goto out;
+               pfn += (next - addr) >> PAGE_SHIFT;
+       } while (addr = next, addr != end);
+out:
+       mutex_unlock(&kvm_hyp_pgd_mutex);
+       return err;
+}
+
+static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
+{
+       if (!is_vmalloc_addr(kaddr)) {
+               BUG_ON(!virt_addr_valid(kaddr));
+               return __pa(kaddr);
+       } else {
+               return page_to_phys(vmalloc_to_page(kaddr)) +
+                      offset_in_page(kaddr);
+       }
+}
+
+/**
+ * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
+ * @from:      The virtual kernel start address of the range
+ * @to:                The virtual kernel end address of the range (exclusive)
+ * @prot:      The protection to be applied to this range
+ *
+ * The same virtual address as the kernel virtual address is also used
+ * in Hyp mode (modulo HYP_PAGE_OFFSET) to map the same underlying
+ * physical pages.
+ */
+int create_hyp_mappings(void *from, void *to, pgprot_t prot)
+{
+       phys_addr_t phys_addr;
+       unsigned long virt_addr;
+       unsigned long start = kern_hyp_va((unsigned long)from);
+       unsigned long end = kern_hyp_va((unsigned long)to);
+
+       if (is_kernel_in_hyp_mode())
+               return 0;
+
+       start = start & PAGE_MASK;
+       end = PAGE_ALIGN(end);
+
+       for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
+               int err;
+
+               phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
+               err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD,
+                                           virt_addr, virt_addr + PAGE_SIZE,
+                                           __phys_to_pfn(phys_addr),
+                                           prot);
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
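+
+/*
+ * A minimal usage sketch (the object and protection shown here are
+ * illustrative): callers typically map a kernel object so that the EL2
+ * code can reach it through its HYP alias, e.g.
+ *
+ *     err = create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP);
+ *     if (err)
+ *             return err;
+ */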
+
+static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
+                                       unsigned long *haddr, pgprot_t prot)
+{
+       pgd_t *pgd = hyp_pgd;
+       unsigned long base;
+       int ret = 0;
+
+       mutex_lock(&kvm_hyp_pgd_mutex);
+
+       /*
+        * This assumes that we have enough space below the idmap
+        * page to allocate our VAs. If not, the check below will
+        * kick in. A potential alternative would be to detect that
+        * overflow and switch to an allocation above the idmap.
+        *
+        * The allocated size is always a multiple of PAGE_SIZE.
+        */
+       size = PAGE_ALIGN(size + offset_in_page(phys_addr));
+       base = io_map_base - size;
+
+       /*
+        * Verify that BIT(VA_BITS - 1) hasn't been flipped by
+        * allocating the new area, as it would indicate we've
+        * overflowed the idmap/IO address range.
+        */
+       if ((base ^ io_map_base) & BIT(VA_BITS - 1))
+               ret = -ENOMEM;
+       else
+               io_map_base = base;
+
+       mutex_unlock(&kvm_hyp_pgd_mutex);
+
+       if (ret)
+               goto out;
+
+       if (__kvm_cpu_uses_extended_idmap())
+               pgd = boot_hyp_pgd;
+
+       ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
+                                   base, base + size,
+                                   __phys_to_pfn(phys_addr), prot);
+       if (ret)
+               goto out;
+
+       *haddr = base + offset_in_page(phys_addr);
+
+out:
+       return ret;
+}
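+
+/*
+ * A worked example of the overflow check above (numbers made up, assuming
+ * VA_BITS == 48): with io_map_base == 0x0000800000100000, a 2MB allocation
+ * yields base == 0x00007ffffff00000. Bit 47 now differs between the two
+ * values, so (base ^ io_map_base) & BIT(47) is non-zero and the allocation
+ * fails with -ENOMEM instead of overflowing the idmap/IO address range.
+ */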
+
+/**
+ * create_hyp_io_mappings - Map IO into both kernel and HYP
+ * @phys_addr: The physical start address which gets mapped
+ * @size:      Size of the region being mapped
+ * @kaddr:     Kernel VA for this mapping
+ * @haddr:     HYP VA for this mapping
+ */
+int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
+                          void __iomem **kaddr,
+                          void __iomem **haddr)
+{
+       unsigned long addr;
+       int ret;
+
+       *kaddr = ioremap(phys_addr, size);
+       if (!*kaddr)
+               return -ENOMEM;
+
+       if (is_kernel_in_hyp_mode()) {
+               *haddr = *kaddr;
+               return 0;
+       }
+
+       ret = __create_hyp_private_mapping(phys_addr, size,
+                                          &addr, PAGE_HYP_DEVICE);
+       if (ret) {
+               iounmap(*kaddr);
+               *kaddr = NULL;
+               *haddr = NULL;
+               return ret;
+       }
+
+       *haddr = (void __iomem *)addr;
+       return 0;
+}
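+
+/*
+ * Usage sketch (the names below are illustrative): mapping a device
+ * region, such as a GIC CPU interface, so that it is reachable from both
+ * the kernel and EL2:
+ *
+ *     void __iomem *kaddr, *haddr;
+ *     ret = create_hyp_io_mappings(gic_phys_base, SZ_8K, &kaddr, &haddr);
+ *
+ * On VHE systems both pointers alias the same ioremap()ed region;
+ * otherwise *haddr is a private HYP VA taken from the io_map_base
+ * allocator above.
+ */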
+
+/**
+ * create_hyp_exec_mappings - Map an executable range into HYP
+ * @phys_addr: The physical start address which gets mapped
+ * @size:      Size of the region being mapped
+ * @haddr:     HYP VA for this mapping
+ */
+int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
+                            void **haddr)
+{
+       unsigned long addr;
+       int ret;
+
+       BUG_ON(is_kernel_in_hyp_mode());
+
+       ret = __create_hyp_private_mapping(phys_addr, size,
+                                          &addr, PAGE_HYP_EXEC);
+       if (ret) {
+               *haddr = NULL;
+               return ret;
+       }
+
+       *haddr = (void *)addr;
+       return 0;
+}
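+
+/*
+ * Usage sketch (illustrative): mapping a page of vectors so that EL2 can
+ * branch to it through a private, executable HYP alias:
+ *
+ *     void *hyp_va;
+ *     ret = create_hyp_exec_mappings(vect_pa, PAGE_SIZE, &hyp_va);
+ */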
+
+/**
+ * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
+ * @kvm:       The KVM struct pointer for the VM.
+ *
+ * Allocates only the stage-2 HW PGD level table(s) of size defined by
+ * stage2_pgd_size(kvm).
+ *
+ * Note we don't need locking here as this is only called when the VM is
+ * created, which can only be done once.
+ */
+int kvm_alloc_stage2_pgd(struct kvm *kvm)
+{
+       phys_addr_t pgd_phys;
+       pgd_t *pgd;
+
+       if (kvm->arch.pgd != NULL) {
+               kvm_err("kvm_arch already initialized?\n");
+               return -EINVAL;
+       }
+
+       /* Allocate the HW PGD, making sure that each page gets its own refcount */
+       pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO);
+       if (!pgd)
+               return -ENOMEM;
+
+       pgd_phys = virt_to_phys(pgd);
+       if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)))
+               return -EINVAL;
+
+       kvm->arch.pgd = pgd;
+       kvm->arch.pgd_phys = pgd_phys;
+       return 0;
+}
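+
+/*
+ * A rough sketch of the expected call site (VM creation, error handling
+ * elided):
+ *
+ *     ret = kvm_alloc_stage2_pgd(kvm);
+ *     if (ret)
+ *             return ret;
+ */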
+
+static void stage2_unmap_memslot(struct kvm *kvm,
+                                struct kvm_memory_slot *memslot)
+{
+       hva_t hva = memslot->userspace_addr;
+       phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
+       phys_addr_t size = PAGE_SIZE * memslot->npages;
+       hva_t reg_end = hva + size;
+
+       /*
+        * A memory region could potentially cover multiple VMAs, and any holes
+        * between them, so iterate over all of them to find out if we should
+        * unmap any of them.
+        *
+        *     +--------------------------------------------+
+        * +---------------+----------------+   +----------------+
+        * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
+        * +---------------+----------------+   +----------------+
+        *     |               memory region                |
+        *     +--------------------------------------------+
+        */
+       do {
+               struct vm_area_struct *vma = find_vma(current->mm, hva);
+               hva_t vm_start, vm_end;
+
+               if (!vma || vma->vm_start >= reg_end)
+                       break;
+
+               /*
+                * Take the intersection of this VMA with the memory region
+                */
+               vm_start = max(hva, vma->vm_start);
+               vm_end = min(reg_end, vma->vm_end);
+
+               if (!(vma->vm_flags & VM_PFNMAP)) {
+                       gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
+                       unmap_stage2_range(kvm, gpa, vm_end - vm_start);
+               }
+               hva = vm_end;
+       } while (hva < reg_end);
+}
+
+/**
+ * stage2_unmap_vm - Unmap Stage-2 RAM mappings
+ * @kvm: The struct kvm pointer
+ *
+ * Go through the memory regions and unmap any regular RAM
+ * backing memory already mapped to the VM.
+ */
+void stage2_unmap_vm(struct kvm *kvm)
+{
+       struct kvm_memslots *slots;
+       struct kvm_memory_slot *memslot;
+       int idx;
+
+       idx = srcu_read_lock(&kvm->srcu);
+       down_read(&current->mm->mmap_sem);
+       spin_lock(&kvm->mmu_lock);
+
+       slots = kvm_memslots(kvm);
+       kvm_for_each_memslot(memslot, slots)
+               stage2_unmap_memslot(kvm, memslot);
+
+       spin_unlock(&kvm->mmu_lock);
+       up_read(&current->mm->mmap_sem);
+       srcu_read_unlock(&kvm->srcu, idx);
+}
+
+/**
+ * kvm_free_stage2_pgd - free all stage-2 tables
+ * @kvm:       The KVM struct pointer for the VM.
+ *
+ * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
+ * underlying level-2 and level-3 tables before freeing the actual level-1 table
+ * and setting the struct pointer to NULL.
+ */
+void kvm_free_stage2_pgd(struct kvm *kvm)
+{
+       void *pgd = NULL;
+
+       spin_lock(&kvm->mmu_lock);
+       if (kvm->arch.pgd) {
+               unmap_stage2_range(kvm, 0, kvm_phys_size(kvm));
+               pgd = READ_ONCE(kvm->arch.pgd);
+               kvm->arch.pgd = NULL;
+               kvm->arch.pgd_phys = 0;
+       }
+       spin_unlock(&kvm->mmu_lock);
+
+       /* Free the HW pgd, one page at a time */
+       if (pgd)
+               free_pages_exact(pgd, stage2_pgd_size(kvm));
+}
+
+static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+                            phys_addr_t addr)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+
+       pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
+       if (stage2_pgd_none(kvm, *pgd)) {
+               if (!cache)
+                       return NULL;
+               pud = mmu_memory_cache_alloc(cache);
+               stage2_pgd_populate(kvm, pgd, pud);
+               get_page(virt_to_page(pgd));
+       }
+
+       return stage2_pud_offset(kvm, pgd, addr);
+}
+
+static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+                            phys_addr_t addr)
+{
+       pud_t *pud;
+       pmd_t *pmd;
+
+       pud = stage2_get_pud(kvm, cache, addr);
+       if (!pud || stage2_pud_huge(kvm, *pud))
+               return NULL;
+
+       if (stage2_pud_none(kvm, *pud)) {
+               if (!cache)
+                       return NULL;
+               pmd = mmu_memory_cache_alloc(cache);
+               stage2_pud_populate(kvm, pud, pmd);
+               get_page(virt_to_page(pud));
+       }
+
+       return stage2_pmd_offset(kvm, pud, addr);
+}
+
+static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
+                              *cache, phys_addr_t addr, const pmd_t *new_pmd)
+{
+       pmd_t *pmd, old_pmd;
+
+retry:
+       pmd = stage2_get_pmd(kvm, cache, addr);
+       VM_BUG_ON(!pmd);
+
+       old_pmd = *pmd;
+       /*
+        * Multiple vcpus faulting on the same PMD entry can
+        * lead to them sequentially updating the PMD with the
+        * same value. Following the break-before-make
+        * (pmd_clear() followed by tlb_flush()) process can
+        * hinder forward progress due to refaults generated
+        * on missing translations.
+        *
+        * Skip updating the page table if the entry is
+        * unchanged.
+        */
+       if (pmd_val(old_pmd) == pmd_val(*new_pmd))
+               return 0;
+
+       if (pmd_present(old_pmd)) {
+               /*
+                * If we already have PTE level mapping for this block,
+                * we must unmap it to avoid inconsistent TLB state and
+                * leaking the table page. We could end up in this situation
+                * if the memory slot was marked for dirty logging and was
+                * reverted, leaving PTE level mappings for the pages accessed
+                * during the period. So, unmap the PTE level mapping for this
+                * block and retry, as we could have released the upper level
+                * table in the process.
+                *
+                * Normal THP split/merge operations follow the
+                * mmu_notifier callbacks and get handled accordingly.
+                */
+               if (!pmd_thp_or_huge(old_pmd)) {
+                       unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
+                       goto retry;
+               }
+               /*
+                * Mapping in huge pages should only happen through a
+                * fault.  If a page is merged into a transparent huge
+                * page, the individual subpages of that huge page
+                * should be unmapped through MMU notifiers before we
+                * get here.
+                *
+                * Merging of CompoundPages is not supported; they
+                * should instead be split first, unmapped, merged,
+                * and mapped back in on demand.
+                */
+               WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
+               pmd_clear(pmd);
+               kvm_tlb_flush_vmid_ipa(kvm, addr);
+       } else {
+               get_page(virt_to_page(pmd));
+       }
+
+       kvm_set_pmd(pmd, *new_pmd);
+       return 0;
+}
+
+static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+                              phys_addr_t addr, const pud_t *new_pudp)
+{
+       pud_t *pudp, old_pud;
+
+retry:
+       pudp = stage2_get_pud(kvm, cache, addr);
+       VM_BUG_ON(!pudp);
+
+       old_pud = *pudp;
+
+       /*
+        * A large number of vcpus faulting on the same stage 2 entry
+        * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
+        * Skip updating the page tables if there is no change.
+        */
+       if (pud_val(old_pud) == pud_val(*new_pudp))
+               return 0;
+
+       if (stage2_pud_present(kvm, old_pud)) {
+               /*
+                * If we already have table level mapping for this block, unmap
+                * the range for this block and retry.
+                */
+               if (!stage2_pud_huge(kvm, old_pud)) {
+                       unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
+                       goto retry;
+               }
+
+               WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
+               stage2_pud_clear(kvm, pudp);
+               kvm_tlb_flush_vmid_ipa(kvm, addr);
+       } else {
+               get_page(virt_to_page(pudp));
+       }
+
+       kvm_set_pud(pudp, *new_pudp);
+       return 0;
+}
+
+/*
+ * stage2_get_leaf_entry - walk the stage2 VM page tables and return
+ * true if a valid and present leaf-entry is found. A pointer to the
+ * leaf-entry is returned in the appropriate level variable - pudpp,
+ * pmdpp, ptepp.
+ */
+static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr,
+                                 pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
+{
+       pud_t *pudp;
+       pmd_t *pmdp;
+       pte_t *ptep;
+
+       *pudpp = NULL;
+       *pmdpp = NULL;
+       *ptepp = NULL;
+
+       pudp = stage2_get_pud(kvm, NULL, addr);
+       if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
+               return false;
+
+       if (stage2_pud_huge(kvm, *pudp)) {
+               *pudpp = pudp;
+               return true;
+       }
+
+       pmdp = stage2_pmd_offset(kvm, pudp, addr);
+       if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
+               return false;
+
+       if (pmd_thp_or_huge(*pmdp)) {
+               *pmdpp = pmdp;
+               return true;
+       }
+
+       ptep = pte_offset_kernel(pmdp, addr);
+       if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
+               return false;
+
+       *ptepp = ptep;
+       return true;
+}
+
+static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr)
+{
+       pud_t *pudp;
+       pmd_t *pmdp;
+       pte_t *ptep;
+       bool found;
+
+       found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep);
+       if (!found)
+               return false;
+
+       if (pudp)
+               return kvm_s2pud_exec(pudp);
+       else if (pmdp)
+               return kvm_s2pmd_exec(pmdp);
+       else
+               return kvm_s2pte_exec(ptep);
+}
+
+static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+                         phys_addr_t addr, const pte_t *new_pte,
+                         unsigned long flags)
+{
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte, old_pte;
+       bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
+       bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
+
+       VM_BUG_ON(logging_active && !cache);
+
+       /* Create stage-2 page table mapping - Levels 0 and 1 */
+       pud = stage2_get_pud(kvm, cache, addr);
+       if (!pud) {
+               /*
+                * Ignore calls from kvm_set_spte_hva for unallocated
+                * address ranges.
+                */
+               return 0;
+       }
+
+       /*
+        * While dirty page logging - dissolve huge PUD, then continue
+        * on to allocate page.
+        */
+       if (logging_active)
+               stage2_dissolve_pud(kvm, addr, pud);
+
+       if (stage2_pud_none(kvm, *pud)) {
+               if (!cache)
+                       return 0; /* ignore calls from kvm_set_spte_hva */
+               pmd = mmu_memory_cache_alloc(cache);
+               stage2_pud_populate(kvm, pud, pmd);
+               get_page(virt_to_page(pud));
+       }
+
+       pmd = stage2_pmd_offset(kvm, pud, addr);
+       if (!pmd) {
+               /*
+                * Ignore calls from kvm_set_spte_hva for unallocated
+                * address ranges.
+                */
+               return 0;
+       }
+
+       /*
+        * While dirty page logging - dissolve huge PMD, then continue on to
+        * allocate page.
+        */
+       if (logging_active)
+               stage2_dissolve_pmd(kvm, addr, pmd);
+
+       /* Create stage-2 page mappings - Level 2 */
+       if (pmd_none(*pmd)) {
+               if (!cache)
+                       return 0; /* ignore calls from kvm_set_spte_hva */
+               pte = mmu_memory_cache_alloc(cache);
+               kvm_pmd_populate(pmd, pte);
+               get_page(virt_to_page(pmd));
+       }
+
+       pte = pte_offset_kernel(pmd, addr);
+
+       if (iomap && pte_present(*pte))
+               return -EFAULT;
+
+       /* Create 2nd stage page table mapping - Level 3 */
+       old_pte = *pte;
+       if (pte_present(old_pte)) {
+               /* Skip page table update if there is no change */
+               if (pte_val(old_pte) == pte_val(*new_pte))
+                       return 0;
+
+               kvm_set_pte(pte, __pte(0));
+               kvm_tlb_flush_vmid_ipa(kvm, addr);
+       } else {
+               get_page(virt_to_page(pte));
+       }
+
+       kvm_set_pte(pte, *new_pte);
+       return 0;
+}
+
+#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+static int stage2_ptep_test_and_clear_young(pte_t *pte)
+{
+       if (pte_young(*pte)) {
+               *pte = pte_mkold(*pte);
+               return 1;
+       }
+       return 0;
+}
+#else
+static int stage2_ptep_test_and_clear_young(pte_t *pte)
+{
+       return __ptep_test_and_clear_young(pte);
+}
+#endif
+
+static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
+{
+       return stage2_ptep_test_and_clear_young((pte_t *)pmd);
+}
+
+static int stage2_pudp_test_and_clear_young(pud_t *pud)
+{
+       return stage2_ptep_test_and_clear_young((pte_t *)pud);
+}
+
+/**
+ * kvm_phys_addr_ioremap - map a device range to guest IPA
+ *
+ * @kvm:       The KVM pointer
+ * @guest_ipa: The IPA at which to insert the mapping
+ * @pa:                The physical address of the device
+ * @size:      The size of the mapping
+ * @writable:  Whether or not to create a writable mapping
+ */
+int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
+                         phys_addr_t pa, unsigned long size, bool writable)
+{
+       phys_addr_t addr, end;
+       int ret = 0;
+       unsigned long pfn;
+       struct kvm_mmu_memory_cache cache = { 0, };
+
+       end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
+       pfn = __phys_to_pfn(pa);
+
+       for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
+               pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE);
+
+               if (writable)
+                       pte = kvm_s2pte_mkwrite(pte);
+
+               ret = mmu_topup_memory_cache(&cache,
+                                            kvm_mmu_cache_min_pages(kvm),
+                                            KVM_NR_MEM_OBJS);
+               if (ret)
+                       goto out;
+               spin_lock(&kvm->mmu_lock);
+               ret = stage2_set_pte(kvm, &cache, addr, &pte,
+                                               KVM_S2PTE_FLAG_IS_IOMAP);
+               spin_unlock(&kvm->mmu_lock);
+               if (ret)
+                       goto out;
+
+               pfn++;
+       }
+
+out:
+       mmu_free_memory_cache(&cache);
+       return ret;
+}
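+
+/*
+ * Usage sketch (the names are illustrative): exposing a host MMIO window,
+ * for instance a GICV region, to the guest at a fixed IPA:
+ *
+ *     ret = kvm_phys_addr_ioremap(kvm, gpa_base, gicv_phys_base,
+ *                                 SZ_8K, true);
+ */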
+
+static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
+{
+       kvm_pfn_t pfn = *pfnp;
+       gfn_t gfn = *ipap >> PAGE_SHIFT;
+
+       if (kvm_is_transparent_hugepage(pfn)) {
+               unsigned long mask;
+               /*
+                * The address we faulted on is backed by a transparent huge
+                * page.  However, because we map the compound huge page and
+                * not the individual tail page, we need to transfer the
+                * refcount to the head page.  We have to be careful that the
+                * THP doesn't start to split while we are adjusting the
+                * refcounts.
+                *
+                * We are sure this doesn't happen, because mmu_notifier_retry
+                * was successful and we are holding the mmu_lock, so if this
+                * THP is trying to split, it will be blocked in the mmu
+                * notifier before touching any of the pages, specifically
+                * before being able to call __split_huge_page_refcount().
+                *
+                * We can therefore safely transfer the refcount from PG_tail
+                * to PG_head and switch the pfn from a tail page to the head
+                * page accordingly.
+                */
+               mask = PTRS_PER_PMD - 1;
+               VM_BUG_ON((gfn & mask) != (pfn & mask));
+               if (pfn & mask) {
+                       *ipap &= PMD_MASK;
+                       kvm_release_pfn_clean(pfn);
+                       pfn &= ~mask;
+                       kvm_get_pfn(pfn);
+                       *pfnp = pfn;
+               }
+
+               return true;
+       }
+
+       return false;
+}
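+
+/*
+ * Worked example (values made up, PTRS_PER_PMD == 512): a fault at IPA
+ * 0x80123000 (gfn 0x80123) resolving to pfn 0x40123, whose compound head
+ * is pfn 0x40000, satisfies (gfn & 511) == (pfn & 511) == 0x123. The IPA
+ * is rounded down to 0x80000000, the reference is moved to pfn 0x40000,
+ * and the caller can then install a single PMD-level block mapping.
+ */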
+
+/**
+ * stage2_wp_ptes - write protect PMD range
+ * @pmd:       pointer to pmd entry
+ * @addr:      range start address
+ * @end:       range end address
+ */
+static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
+{
+       pte_t *pte;
+
+       pte = pte_offset_kernel(pmd, addr);
+       do {
+               if (!pte_none(*pte)) {
+                       if (!kvm_s2pte_readonly(pte))
+                               kvm_set_s2pte_readonly(pte);
+               }
+       } while (pte++, addr += PAGE_SIZE, addr != end);
+}
+
+/**
+ * stage2_wp_pmds - write protect PUD range
+ * @kvm:       kvm instance for the VM
+ * @pud:       pointer to pud entry
+ * @addr:      range start address
+ * @end:       range end address
+ */
+static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
+                          phys_addr_t addr, phys_addr_t end)
+{
+       pmd_t *pmd;
+       phys_addr_t next;
+
+       pmd = stage2_pmd_offset(kvm, pud, addr);
+
+       do {
+               next = stage2_pmd_addr_end(kvm, addr, end);
+               if (!pmd_none(*pmd)) {
+                       if (pmd_thp_or_huge(*pmd)) {
+                               if (!kvm_s2pmd_readonly(pmd))
+                                       kvm_set_s2pmd_readonly(pmd);
+                       } else {
+                               stage2_wp_ptes(pmd, addr, next);
+                       }
+               }
+       } while (pmd++, addr = next, addr != end);
+}
+
+/**
+ * stage2_wp_puds - write protect PGD range
+ * @kvm:       kvm instance for the VM
+ * @pgd:       pointer to pgd entry
+ * @addr:      range start address
+ * @end:       range end address
+ */
+static void  stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
+                           phys_addr_t addr, phys_addr_t end)
+{
+       pud_t *pud;
+       phys_addr_t next;
+
+       pud = stage2_pud_offset(kvm, pgd, addr);
+       do {
+               next = stage2_pud_addr_end(kvm, addr, end);
+               if (!stage2_pud_none(kvm, *pud)) {
+                       if (stage2_pud_huge(kvm, *pud)) {
+                               if (!kvm_s2pud_readonly(pud))
+                                       kvm_set_s2pud_readonly(pud);
+                       } else {
+                               stage2_wp_pmds(kvm, pud, addr, next);
+                       }
+               }
+       } while (pud++, addr = next, addr != end);
+}
+
+/**
+ * stage2_wp_range() - write protect stage2 memory region range
+ * @kvm:       The KVM pointer
+ * @addr:      Start address of range
+ * @end:       End address of range
+ */
+static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
+{
+       pgd_t *pgd;
+       phys_addr_t next;
+
+       pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
+       do {
+               /*
+                * Release kvm_mmu_lock periodically if the memory region is
+                * large. Otherwise, we may see kernel panics with
+                * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
+                * CONFIG_LOCKDEP. Additionally, holding the lock too long
+                * will also starve other vCPUs. We also have to make sure
+                * that the page tables are not freed while we release
+                * the lock.
+                */
+               cond_resched_lock(&kvm->mmu_lock);
+               if (!READ_ONCE(kvm->arch.pgd))
+                       break;
+               next = stage2_pgd_addr_end(kvm, addr, end);
+               if (stage2_pgd_present(kvm, *pgd))
+                       stage2_wp_puds(kvm, pgd, addr, next);
+       } while (pgd++, addr = next, addr != end);
+}
+
+/**
+ * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
+ * @kvm:       The KVM pointer
+ * @slot:      The memory slot to write protect
+ *
+ * Called to start logging dirty pages when the KVM_MEM_LOG_DIRTY_PAGES
+ * flag is set on a memory region. After this function returns, all present
+ * PUD, PMD and PTE entries in the memory region are write protected, and
+ * the dirty page log can then be read.
+ *
+ * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
+ * serializing operations for VM memory regions.
+ */
+void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
+{
+       struct kvm_memslots *slots = kvm_memslots(kvm);
+       struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
+       phys_addr_t start, end;
+
+       if (WARN_ON_ONCE(!memslot))
+               return;
+
+       start = memslot->base_gfn << PAGE_SHIFT;
+       end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
+
+       spin_lock(&kvm->mmu_lock);
+       stage2_wp_range(kvm, start, end);
+       spin_unlock(&kvm->mmu_lock);
+       kvm_flush_remote_tlbs(kvm);
+}
+
+/**
+ * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
+ * @kvm:       The KVM pointer
+ * @slot:      The memory slot associated with mask
+ * @gfn_offset:        The gfn offset in memory slot
+ * @mask:      The mask of dirty pages at offset 'gfn_offset' in this memory
+ *             slot to be write protected
+ *
+ * Walks the bits set in @mask and write protects the associated PTEs. The
+ * caller must hold kvm_mmu_lock.
+ */
+static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+               struct kvm_memory_slot *slot,
+               gfn_t gfn_offset, unsigned long mask)
+{
+       phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
+       phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
+       phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
+
+       stage2_wp_range(kvm, start, end);
+}
+
+/*
+ * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
+ * dirty pages.
+ *
+ * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
+ * enable dirty logging for them.
+ */
+void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
+               struct kvm_memory_slot *slot,
+               gfn_t gfn_offset, unsigned long mask)
+{
+       kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
+}
+
+static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
+{
+       __clean_dcache_guest_page(pfn, size);
+}
+
+static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
+{
+       __invalidate_icache_guest_page(pfn, size);
+}
+
+static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
+{
+       send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
+}
+
+static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
+                                              unsigned long hva,
+                                              unsigned long map_size)
+{
+       gpa_t gpa_start;
+       hva_t uaddr_start, uaddr_end;
+       size_t size;
+
+       size = memslot->npages * PAGE_SIZE;
+
+       gpa_start = memslot->base_gfn << PAGE_SHIFT;
+
+       uaddr_start = memslot->userspace_addr;
+       uaddr_end = uaddr_start + size;
+
+       /*
+        * Pages belonging to memslots that don't have the same alignment
+        * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
+        * PMD/PUD entries, because we'll end up mapping the wrong pages.
+        *
+        * Consider a layout like the following:
+        *
+        *    memslot->userspace_addr:
+        *    +-----+--------------------+--------------------+---+
+        *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
+        *    +-----+--------------------+--------------------+---+
+        *
+        *    memslot->base_gfn << PAGE_SIZE:
+        *      +---+--------------------+--------------------+-----+
+        *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
+        *      +---+--------------------+--------------------+-----+
+        *
+        * If we create those stage-2 blocks, we'll end up with this incorrect
+        * mapping:
+        *   d -> f
+        *   e -> g
+        *   f -> h
+        */
+       if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
+               return false;
+
+       /*
+        * Next, let's make sure we're not trying to map anything not covered
+        * by the memslot. This means we have to prohibit block size mappings
+        * for the beginning and end of a non-block aligned and non-block sized
+        * memory slot (illustrated by the head and tail parts of the
+        * userspace view above containing pages 'abcde' and 'xyz',
+        * respectively).
+        *
+        * Note that it doesn't matter if we do the check using the
+        * userspace_addr or the base_gfn, as both are equally aligned (per
+        * the check above) and equally sized.
+        */
+       return (hva & ~(map_size - 1)) >= uaddr_start &&
+              (hva & ~(map_size - 1)) + map_size <= uaddr_end;
+}
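+
+/*
+ * Concrete example of the alignment rule above (addresses made up): with
+ * map_size == PMD_SIZE (2MB), a memslot whose base IPA is 0x80200000 but
+ * whose userspace address is 0x40001000 has gpa_start & (2MB - 1) == 0
+ * while uaddr_start & (2MB - 1) == 0x1000, so block mappings are refused
+ * and user_mem_abort() below falls back to PAGE_SIZE mappings.
+ */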
+
+static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+                         struct kvm_memory_slot *memslot, unsigned long hva,
+                         unsigned long fault_status)
+{
+       int ret;
+       bool write_fault, writable, force_pte = false;
+       bool exec_fault, needs_exec;
+       unsigned long mmu_seq;
+       gfn_t gfn = fault_ipa >> PAGE_SHIFT;
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
+       struct vm_area_struct *vma;
+       short vma_shift;
+       kvm_pfn_t pfn;
+       pgprot_t mem_type = PAGE_S2;
+       bool logging_active = memslot_is_logging(memslot);
+       unsigned long vma_pagesize, flags = 0;
+
+       write_fault = kvm_is_write_fault(vcpu);
+       exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
+       VM_BUG_ON(write_fault && exec_fault);
+
+       if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
+               kvm_err("Unexpected L2 read permission error\n");
+               return -EFAULT;
+       }
+
+       /* Let's check if we will get back a huge page backed by hugetlbfs */
+       down_read(&current->mm->mmap_sem);
+       vma = find_vma_intersection(current->mm, hva, hva + 1);
+       if (unlikely(!vma)) {
+               kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
+               up_read(&current->mm->mmap_sem);
+               return -EFAULT;
+       }
+
+       if (is_vm_hugetlb_page(vma))
+               vma_shift = huge_page_shift(hstate_vma(vma));
+       else
+               vma_shift = PAGE_SHIFT;
+
+       vma_pagesize = 1ULL << vma_shift;
+       if (logging_active ||
+           (vma->vm_flags & VM_PFNMAP) ||
+           !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
+               force_pte = true;
+               vma_pagesize = PAGE_SIZE;
+       }
+
+       /*
+        * The stage2 page tables have a minimum of 2 levels (for arm64, see
+        * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
+        * use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
+        * As for PUD huge maps, we must make sure that we have at least
+        * 3 levels, i.e., the PMD is not folded.
+        */
+       if (vma_pagesize == PMD_SIZE ||
+           (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
+               gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
+       up_read(&current->mm->mmap_sem);
+
+       /* We need minimum second+third level pages */
+       ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm),
+                                    KVM_NR_MEM_OBJS);
+       if (ret)
+               return ret;
+
+       mmu_seq = vcpu->kvm->mmu_notifier_seq;
+       /*
+        * Ensure the read of mmu_notifier_seq happens before we call
+        * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
+        * the page we just got a reference to getting unmapped before we have a
+        * chance to grab the mmu_lock, which ensures that if the page gets
+        * unmapped afterwards, the call to kvm_unmap_hva will take it away
+        * from us again properly. This smp_rmb() interacts with the smp_wmb()
+        * in kvm_mmu_notifier_invalidate_<page|range_end>.
+        */
+       smp_rmb();
+
+       pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
+       if (pfn == KVM_PFN_ERR_HWPOISON) {
+               kvm_send_hwpoison_signal(hva, vma_shift);
+               return 0;
+       }
+       if (is_error_noslot_pfn(pfn))
+               return -EFAULT;
+
+       if (kvm_is_device_pfn(pfn)) {
+               mem_type = PAGE_S2_DEVICE;
+               flags |= KVM_S2PTE_FLAG_IS_IOMAP;
+       } else if (logging_active) {
+               /*
+                * Faults on pages in a memslot with logging enabled
+                * should not be mapped with huge pages (it introduces churn
+                * and performance degradation), so force a pte mapping.
+                */
+               flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
+
+               /*
+                * Only actually map the page as writable if this was a write
+                * fault.
+                */
+               if (!write_fault)
+                       writable = false;
+       }
+
+       if (exec_fault && is_iomap(flags))
+               return -ENOEXEC;
+
+       spin_lock(&kvm->mmu_lock);
+       if (mmu_notifier_retry(kvm, mmu_seq))
+               goto out_unlock;
+
+       if (vma_pagesize == PAGE_SIZE && !force_pte) {
+               /*
+                * Only PMD_SIZE transparent hugepages (THP) are
+                * currently supported. This code will need to be
+                * updated to support other THP sizes.
+                *
+                * Make sure the host VA and the guest IPA are sufficiently
+                * aligned and that the block is contained within the memslot.
+                */
+               if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
+                   transparent_hugepage_adjust(&pfn, &fault_ipa))
+                       vma_pagesize = PMD_SIZE;
+       }
+
+       if (writable)
+               kvm_set_pfn_dirty(pfn);
+
+       if (fault_status != FSC_PERM && !is_iomap(flags))
+               clean_dcache_guest_page(pfn, vma_pagesize);
+
+       if (exec_fault)
+               invalidate_icache_guest_page(pfn, vma_pagesize);
+
+       /*
+        * If we took an execution fault we have made the
+        * icache/dcache coherent above and should now let the s2
+        * mapping be executable.
+        *
+        * Write faults (!exec_fault && FSC_PERM) are orthogonal to
+        * execute permissions, and we preserve whatever we have.
+        */
+       needs_exec = exec_fault ||
+               (fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa));
+
+       if (vma_pagesize == PUD_SIZE) {
+               pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
+
+               new_pud = kvm_pud_mkhuge(new_pud);
+               if (writable)
+                       new_pud = kvm_s2pud_mkwrite(new_pud);
+
+               if (needs_exec)
+                       new_pud = kvm_s2pud_mkexec(new_pud);
+
+               ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud);
+       } else if (vma_pagesize == PMD_SIZE) {
+               pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
+
+               new_pmd = kvm_pmd_mkhuge(new_pmd);
+
+               if (writable)
+                       new_pmd = kvm_s2pmd_mkwrite(new_pmd);
+
+               if (needs_exec)
+                       new_pmd = kvm_s2pmd_mkexec(new_pmd);
+
+               ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
+       } else {
+               pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
+
+               if (writable) {
+                       new_pte = kvm_s2pte_mkwrite(new_pte);
+                       mark_page_dirty(kvm, gfn);
+               }
+
+               if (needs_exec)
+                       new_pte = kvm_s2pte_mkexec(new_pte);
+
+               ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
+       }
+
+out_unlock:
+       spin_unlock(&kvm->mmu_lock);
+       kvm_set_pfn_accessed(pfn);
+       kvm_release_pfn_clean(pfn);
+       return ret;
+}
+
+/*
+ * Resolve the access fault by making the page young again.
+ * Note that because the faulting entry is guaranteed not to be
+ * cached in the TLB, we don't need to invalidate anything.
+ * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
+ * so there is no need for atomic (pte|pmd)_mkyoung operations.
+ */
+static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
+{
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+       kvm_pfn_t pfn;
+       bool pfn_valid = false;
+
+       trace_kvm_access_fault(fault_ipa);
+
+       spin_lock(&vcpu->kvm->mmu_lock);
+
+       if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte))
+               goto out;
+
+       if (pud) {              /* HugeTLB */
+               *pud = kvm_s2pud_mkyoung(*pud);
+               pfn = kvm_pud_pfn(*pud);
+               pfn_valid = true;
+       } else  if (pmd) {      /* THP, HugeTLB */
+               *pmd = pmd_mkyoung(*pmd);
+               pfn = pmd_pfn(*pmd);
+               pfn_valid = true;
+       } else {
+               *pte = pte_mkyoung(*pte);       /* Just a page... */
+               pfn = pte_pfn(*pte);
+               pfn_valid = true;
+       }
+
+out:
+       spin_unlock(&vcpu->kvm->mmu_lock);
+       if (pfn_valid)
+               kvm_set_pfn_accessed(pfn);
+}
+
+/**
+ * kvm_handle_guest_abort - handles all 2nd stage aborts
+ * @vcpu:      the VCPU pointer
+ * @run:       the kvm_run structure
+ *
+ * Any abort that gets to the host is almost guaranteed to be caused by a
+ * missing second stage translation table entry, which can mean either that the
+ * guest simply needs more memory and we must allocate an appropriate page, or
+ * that the guest tried to access I/O memory, which is emulated by user
+ * space. The distinction is based on the IPA causing the fault and whether this
+ * memory region has been registered as standard RAM by user space.
+ */
+int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+       unsigned long fault_status;
+       phys_addr_t fault_ipa;
+       struct kvm_memory_slot *memslot;
+       unsigned long hva;
+       bool is_iabt, write_fault, writable;
+       gfn_t gfn;
+       int ret, idx;
+
+       fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
+
+       fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
+       is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
+
+       /* Synchronous External Abort? */
+       if (kvm_vcpu_dabt_isextabt(vcpu)) {
+               /*
+                * For RAS the host kernel may handle this abort.
+                * There is no need to pass the error into the guest.
+                */
+               if (!kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu)))
+                       return 1;
+
+               if (unlikely(!is_iabt)) {
+                       kvm_inject_vabt(vcpu);
+                       return 1;
+               }
+       }
+
+       trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
+                             kvm_vcpu_get_hfar(vcpu), fault_ipa);
+
+       /* Check that the stage-2 fault is a translation, permission or access fault */
+       if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
+           fault_status != FSC_ACCESS) {
+               kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
+                       kvm_vcpu_trap_get_class(vcpu),
+                       (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
+                       (unsigned long)kvm_vcpu_get_hsr(vcpu));
+               return -EFAULT;
+       }
+
+       idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+       gfn = fault_ipa >> PAGE_SHIFT;
+       memslot = gfn_to_memslot(vcpu->kvm, gfn);
+       hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
+       write_fault = kvm_is_write_fault(vcpu);
+       if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
+               if (is_iabt) {
+                       /* Prefetch Abort on I/O address */
+                       ret = -ENOEXEC;
+                       goto out;
+               }
+
+               /*
+                * Check for a cache maintenance operation. Since we
+                * ended up here, we know it is outside of any memory
+                * slot. But we can't find out if that is for a device,
+                * or if the guest is just being stupid. The only thing
+                * we know for sure is that this range cannot be cached.
+                *
+                * So let's assume that the guest is just being
+                * cautious, and skip the instruction.
+                */
+               if (kvm_vcpu_dabt_is_cm(vcpu)) {
+                       kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+                       ret = 1;
+                       goto out_unlock;
+               }
+
+               /*
+                * The IPA is reported as [MAX:12], so we need to
+                * complement it with the bottom 12 bits from the
+                * faulting VA. This is always 12 bits, irrespective
+                * of the page size.
+                */
+               fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
+               ret = io_mem_abort(vcpu, run, fault_ipa);
+               goto out_unlock;
+       }
+
+       /* Userspace should not be able to register out-of-bounds IPAs */
+       VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
+
+       if (fault_status == FSC_ACCESS) {
+               handle_access_fault(vcpu, fault_ipa);
+               ret = 1;
+               goto out_unlock;
+       }
+
+       ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
+       if (ret == 0)
+               ret = 1;
+out:
+       if (ret == -ENOEXEC) {
+               kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
+               ret = 1;
+       }
+out_unlock:
+       srcu_read_unlock(&vcpu->kvm->srcu, idx);
+       return ret;
+}
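+
+/*
+ * This is the handler dispatched (from handle_exit.c) for data and
+ * instruction aborts taken from a lower exception level. A return value
+ * of 1 resumes the guest, 0 hands control back to userspace (for example
+ * to emulate MMIO), and a negative value is an error.
+ */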
+
+static int handle_hva_to_gpa(struct kvm *kvm,
+                            unsigned long start,
+                            unsigned long end,
+                            int (*handler)(struct kvm *kvm,
+                                           gpa_t gpa, u64 size,
+                                           void *data),
+                            void *data)
+{
+       struct kvm_memslots *slots;
+       struct kvm_memory_slot *memslot;
+       int ret = 0;
+
+       slots = kvm_memslots(kvm);
+
+       /* we only care about the pages that the guest sees */
+       kvm_for_each_memslot(memslot, slots) {
+               unsigned long hva_start, hva_end;
+               gfn_t gpa;
+
+               hva_start = max(start, memslot->userspace_addr);
+               hva_end = min(end, memslot->userspace_addr +
+                                       (memslot->npages << PAGE_SHIFT));
+               if (hva_start >= hva_end)
+                       continue;
+
+               gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
+               ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
+       }
+
+       return ret;
+}
+
+static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
+{
+       unmap_stage2_range(kvm, gpa, size);
+       return 0;
+}
+
+int kvm_unmap_hva_range(struct kvm *kvm,
+                       unsigned long start, unsigned long end)
+{
+       if (!kvm->arch.pgd)
+               return 0;
+
+       trace_kvm_unmap_hva_range(start, end);
+       handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
+       return 0;
+}
+
+static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
+{
+       pte_t *pte = (pte_t *)data;
+
+       WARN_ON(size != PAGE_SIZE);
+       /*
+        * We can always call stage2_set_pte with the KVM_S2_FLAG_LOGGING_ACTIVE
+        * flag clear because MMU notifiers will have unmapped a huge PMD before
+        * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
+        * therefore stage2_set_pte() never needs to clear out a huge PMD
+        * through this calling path.
+        */
+       stage2_set_pte(kvm, NULL, gpa, pte, 0);
+       return 0;
+}
+
+
+int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+       unsigned long end = hva + PAGE_SIZE;
+       kvm_pfn_t pfn = pte_pfn(pte);
+       pte_t stage2_pte;
+
+       if (!kvm->arch.pgd)
+               return 0;
+
+       trace_kvm_set_spte_hva(hva);
+
+       /*
+        * We've moved a page around, probably through CoW, so let's treat it
+        * just like a translation fault and clean the cache to the PoC.
+        */
+       clean_dcache_guest_page(pfn, PAGE_SIZE);
+       stage2_pte = kvm_pfn_pte(pfn, PAGE_S2);
+       handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
+
+       return 0;
+}
+
+static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
+{
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
+       if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
+               return 0;
+
+       if (pud)
+               return stage2_pudp_test_and_clear_young(pud);
+       else if (pmd)
+               return stage2_pmdp_test_and_clear_young(pmd);
+       else
+               return stage2_ptep_test_and_clear_young(pte);
+}
+
+static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
+{
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
+       if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
+               return 0;
+
+       if (pud)
+               return kvm_s2pud_young(*pud);
+       else if (pmd)
+               return pmd_young(*pmd);
+       else
+               return pte_young(*pte);
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+       if (!kvm->arch.pgd)
+               return 0;
+       trace_kvm_age_hva(start, end);
+       return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
+}
+
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+       if (!kvm->arch.pgd)
+               return 0;
+       trace_kvm_test_age_hva(hva);
+       return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
+                                kvm_test_age_hva_handler, NULL);
+}
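+
+/*
+ * kvm_unmap_hva_range(), kvm_set_spte_hva(), kvm_age_hva() and
+ * kvm_test_age_hva() above are the arch hooks invoked by the generic KVM
+ * MMU notifier callbacks, which is why each of them bails out early when
+ * no stage-2 page table has been allocated yet.
+ */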
+
+void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
+{
+       mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
+}
+
+phys_addr_t kvm_mmu_get_httbr(void)
+{
+       if (__kvm_cpu_uses_extended_idmap())
+               return virt_to_phys(merged_hyp_pgd);
+       else
+               return virt_to_phys(hyp_pgd);
+}
+
+phys_addr_t kvm_get_idmap_vector(void)
+{
+       return hyp_idmap_vector;
+}
+
+static int kvm_map_idmap_text(pgd_t *pgd)
+{
+       int err;
+
+       /* Create the idmap in the boot page tables */
+       err =   __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
+                                     hyp_idmap_start, hyp_idmap_end,
+                                     __phys_to_pfn(hyp_idmap_start),
+                                     PAGE_HYP_EXEC);
+       if (err)
+               kvm_err("Failed to idmap %lx-%lx\n",
+                       hyp_idmap_start, hyp_idmap_end);
+
+       return err;
+}
+
+int kvm_mmu_init(void)
+{
+       int err;
+
+       hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start);
+       hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
+       hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end);
+       hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
+       hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init);
+
+       /*
+        * We rely on the linker script to ensure at build time that the HYP
+        * init code does not cross a page boundary.
+        */
+       BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
+
+       kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
+       kvm_debug("HYP VA range: %lx:%lx\n",
+                 kern_hyp_va(PAGE_OFFSET),
+                 kern_hyp_va((unsigned long)high_memory - 1));
+
+       if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
+           hyp_idmap_start <  kern_hyp_va((unsigned long)high_memory - 1) &&
+           hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
+               /*
+                * The idmap page intersects with the VA space; it is
+                * not safe to continue further.
+                */
+               kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
+               err = -EINVAL;
+               goto out;
+       }
+
+       hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
+       if (!hyp_pgd) {
+               kvm_err("Hyp mode PGD not allocated\n");
+               err = -ENOMEM;
+               goto out;
+       }
+
+       if (__kvm_cpu_uses_extended_idmap()) {
+               boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+                                                        hyp_pgd_order);
+               if (!boot_hyp_pgd) {
+                       kvm_err("Hyp boot PGD not allocated\n");
+                       err = -ENOMEM;
+                       goto out;
+               }
+
+               err = kvm_map_idmap_text(boot_hyp_pgd);
+               if (err)
+                       goto out;
+
+               merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+               if (!merged_hyp_pgd) {
+                       kvm_err("Failed to allocate extra HYP pgd\n");
+                       err = -ENOMEM;
+                       goto out;
+               }
+               __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
+                                   hyp_idmap_start);
+       } else {
+               err = kvm_map_idmap_text(hyp_pgd);
+               if (err)
+                       goto out;
+       }
+
+       io_map_base = hyp_idmap_start;
+       return 0;
+out:
+       free_hyp_pgds();
+       return err;
+}
+
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+                                  const struct kvm_userspace_memory_region *mem,
+                                  struct kvm_memory_slot *old,
+                                  const struct kvm_memory_slot *new,
+                                  enum kvm_mr_change change)
+{
+       /*
+        * At this point memslot has been committed and there is an
+        * allocated dirty_bitmap[]; dirty pages will be tracked while the
+        * memory slot is write protected.
+        */
+       if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES)
+               kvm_mmu_wp_memory_region(kvm, mem->slot);
+}
+
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+                                  struct kvm_memory_slot *memslot,
+                                  const struct kvm_userspace_memory_region *mem,
+                                  enum kvm_mr_change change)
+{
+       hva_t hva = mem->userspace_addr;
+       hva_t reg_end = hva + mem->memory_size;
+       bool writable = !(mem->flags & KVM_MEM_READONLY);
+       int ret = 0;
+
+       if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
+                       change != KVM_MR_FLAGS_ONLY)
+               return 0;
+
+       /*
+        * Prevent userspace from creating a memory region outside of the
+        * IPA space addressable by the guest.
+        */
+       if (memslot->base_gfn + memslot->npages >=
+           (kvm_phys_size(kvm) >> PAGE_SHIFT))
+               return -EFAULT;
+
+       down_read(&current->mm->mmap_sem);
+       /*
+        * A memory region could potentially cover multiple VMAs, and any holes
+        * between them, so iterate over all of them to find out if we can map
+        * any of them right now.
+        *
+        *     +--------------------------------------------+
+        * +---------------+----------------+   +----------------+
+        * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
+        * +---------------+----------------+   +----------------+
+        *     |               memory region                |
+        *     +--------------------------------------------+
+        */
+       do {
+               struct vm_area_struct *vma = find_vma(current->mm, hva);
+               hva_t vm_start, vm_end;
+
+               if (!vma || vma->vm_start >= reg_end)
+                       break;
+
+               /*
+                * Take the intersection of this VMA with the memory region
+                */
+               vm_start = max(hva, vma->vm_start);
+               vm_end = min(reg_end, vma->vm_end);
+
+               if (vma->vm_flags & VM_PFNMAP) {
+                       gpa_t gpa = mem->guest_phys_addr +
+                                   (vm_start - mem->userspace_addr);
+                       phys_addr_t pa;
+
+                       pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
+                       pa += vm_start - vma->vm_start;
+
+                       /* IO region dirty page logging not allowed */
+                       if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+                               ret = -EINVAL;
+                               goto out;
+                       }
+
+                       ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
+                                                   vm_end - vm_start,
+                                                   writable);
+                       if (ret)
+                               break;
+               }
+               hva = vm_end;
+       } while (hva < reg_end);
+
+       if (change == KVM_MR_FLAGS_ONLY)
+               goto out;
+
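+       /*
+        * On failure, undo any stage-2 IO mappings created above; otherwise
+        * flush the caches for the memslot.
+        */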
+       spin_lock(&kvm->mmu_lock);
+       if (ret)
+               unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
+       else
+               stage2_flush_memslot(kvm, memslot);
+       spin_unlock(&kvm->mmu_lock);
+out:
+       up_read(&current->mm->mmap_sem);
+       return ret;
+}
+
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+}
+
+void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
+{
+}
+
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{
+       kvm_free_stage2_pgd(kvm);
+}
+
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+                                  struct kvm_memory_slot *slot)
+{
+       gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
+       phys_addr_t size = slot->npages << PAGE_SHIFT;
+
+       spin_lock(&kvm->mmu_lock);
+       unmap_stage2_range(kvm, gpa, size);
+       spin_unlock(&kvm->mmu_lock);
+}
+
+/*
+ * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
+ *
+ * Main problems:
+ * - S/W ops are local to a CPU (not broadcast)
+ * - We have line migration behind our back (speculation)
+ * - System caches don't support S/W at all (damn!)
+ *
+ * In the face of the above, the best we can do is to try and convert
+ * S/W ops to VA ops. Because the guest is not allowed to infer the
+ * S/W to PA mapping, it can only use S/W to nuke the whole cache,
+ * which is a rather good thing for us.
+ *
+ * Also, it is only used when turning caches on/off ("The expected
+ * usage of the cache maintenance instructions that operate by set/way
+ * is associated with the cache maintenance instructions associated
+ * with the powerdown and powerup of caches, if this is required by
+ * the implementation.").
+ *
+ * We use the following policy:
+ *
+ * - If we trap a S/W operation, we enable VM trapping to detect
+ *   caches being turned on/off, and do a full clean.
+ *
+ * - We flush the caches both when they are turned on and when turned off.
+ *
+ * - Once the caches are enabled, we stop trapping VM ops.
+ */
+void kvm_set_way_flush(struct kvm_vcpu *vcpu)
+{
+       unsigned long hcr = *vcpu_hcr(vcpu);
+
+       /*
+        * If this is the first time we do a S/W operation
+        * (i.e. HCR_TVM not set) flush the whole memory, and set the
+        * VM trapping.
+        *
+        * Otherwise, rely on the VM trapping to wait for the MMU +
+        * Caches to be turned off. At that point, we'll be able to
+        * clean the caches again.
+        */
+       if (!(hcr & HCR_TVM)) {
+               trace_kvm_set_way_flush(*vcpu_pc(vcpu),
+                                       vcpu_has_cache_enabled(vcpu));
+               stage2_flush_vm(vcpu->kvm);
+               *vcpu_hcr(vcpu) = hcr | HCR_TVM;
+       }
+}
+
+void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
+{
+       bool now_enabled = vcpu_has_cache_enabled(vcpu);
+
+       /*
+        * If switching the MMU+caches on, need to invalidate the caches.
+        * If switching it off, need to clean the caches.
+        * Clean + invalidate does the trick always.
+        */
+       if (now_enabled != was_enabled)
+               stage2_flush_vm(vcpu->kvm);
+
+       /* Caches are now on, stop trapping VM ops (until a S/W op) */
+       if (now_enabled)
+               *vcpu_hcr(vcpu) &= ~HCR_TVM;
+
+       trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
+}
diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c
new file mode 100644 (file)
index 0000000..d45b8b9
--- /dev/null
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Based on the x86 implementation.
+ *
+ * Copyright (C) 2012 ARM Ltd.
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+
+#include <linux/perf_event.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_emulate.h>
+
+static int kvm_is_in_guest(void)
+{
+       return kvm_get_running_vcpu() != NULL;
+}
+
+static int kvm_is_user_mode(void)
+{
+       struct kvm_vcpu *vcpu;
+
+       vcpu = kvm_get_running_vcpu();
+
+       if (vcpu)
+               return !vcpu_mode_priv(vcpu);
+
+       return 0;
+}
+
+static unsigned long kvm_get_guest_ip(void)
+{
+       struct kvm_vcpu *vcpu;
+
+       vcpu = kvm_get_running_vcpu();
+
+       if (vcpu)
+               return *vcpu_pc(vcpu);
+
+       return 0;
+}
+
+static struct perf_guest_info_callbacks kvm_guest_cbs = {
+       .is_in_guest    = kvm_is_in_guest,
+       .is_user_mode   = kvm_is_user_mode,
+       .get_guest_ip   = kvm_get_guest_ip,
+};
+
+int kvm_perf_init(void)
+{
+       return perf_register_guest_info_callbacks(&kvm_guest_cbs);
+}
+
+int kvm_perf_teardown(void)
+{
+       return perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
+}
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
new file mode 100644 (file)
index 0000000..f0d0312
--- /dev/null
@@ -0,0 +1,869 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2015 Linaro Ltd.
+ * Author: Shannon Zhao <shannon.zhao@linaro.org>
+ */
+
+#include <linux/cpu.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include <linux/perf/arm_pmu.h>
+#include <linux/uaccess.h>
+#include <asm/kvm_emulate.h>
+#include <kvm/arm_pmu.h>
+#include <kvm/arm_vgic.h>
+
+static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx);
+static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx);
+static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc);
+
+#define PERF_ATTR_CFG1_KVM_PMU_CHAINED 0x1
+
+/**
+ * kvm_pmu_idx_is_64bit - determine if select_idx is a 64bit counter
+ * @vcpu: The vcpu pointer
+ * @select_idx: The counter index
+ */
+static bool kvm_pmu_idx_is_64bit(struct kvm_vcpu *vcpu, u64 select_idx)
+{
+       return (select_idx == ARMV8_PMU_CYCLE_IDX &&
+               __vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_LC);
+}
+
+static struct kvm_vcpu *kvm_pmc_to_vcpu(struct kvm_pmc *pmc)
+{
+       struct kvm_pmu *pmu;
+       struct kvm_vcpu_arch *vcpu_arch;
+
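+       /* Rewind to pmc[0] so container_of() can recover the enclosing kvm_pmu */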
+       pmc -= pmc->idx;
+       pmu = container_of(pmc, struct kvm_pmu, pmc[0]);
+       vcpu_arch = container_of(pmu, struct kvm_vcpu_arch, pmu);
+       return container_of(vcpu_arch, struct kvm_vcpu, arch);
+}
+
+/**
+ * kvm_pmu_pmc_is_chained - determine if the pmc is chained
+ * @pmc: The PMU counter pointer
+ */
+static bool kvm_pmu_pmc_is_chained(struct kvm_pmc *pmc)
+{
+       struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
+
+       return test_bit(pmc->idx >> 1, vcpu->arch.pmu.chained);
+}
+
+/**
+ * kvm_pmu_idx_is_high_counter - determine if select_idx is a high/low counter
+ * @select_idx: The counter index
+ */
+static bool kvm_pmu_idx_is_high_counter(u64 select_idx)
+{
+       return select_idx & 0x1;
+}
+
+/**
+ * kvm_pmu_get_canonical_pmc - obtain the canonical pmc
+ * @pmc: The PMU counter pointer
+ *
+ * When a pair of PMCs are chained together we use the low counter (canonical)
+ * to hold the underlying perf event.
+ */
+static struct kvm_pmc *kvm_pmu_get_canonical_pmc(struct kvm_pmc *pmc)
+{
+       if (kvm_pmu_pmc_is_chained(pmc) &&
+           kvm_pmu_idx_is_high_counter(pmc->idx))
+               return pmc - 1;
+
+       return pmc;
+}
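+
+/* Return the other half of the even/odd counter pair containing @pmc */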
+static struct kvm_pmc *kvm_pmu_get_alternate_pmc(struct kvm_pmc *pmc)
+{
+       if (kvm_pmu_idx_is_high_counter(pmc->idx))
+               return pmc - 1;
+       else
+               return pmc + 1;
+}
+
+/**
+ * kvm_pmu_idx_has_chain_evtype - determine if the event type is chain
+ * @vcpu: The vcpu pointer
+ * @select_idx: The counter index
+ */
+static bool kvm_pmu_idx_has_chain_evtype(struct kvm_vcpu *vcpu, u64 select_idx)
+{
+       u64 eventsel, reg;
+
+       select_idx |= 0x1;
+
+       if (select_idx == ARMV8_PMU_CYCLE_IDX)
+               return false;
+
+       reg = PMEVTYPER0_EL0 + select_idx;
+       eventsel = __vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_EVENT;
+
+       return eventsel == ARMV8_PMUV3_PERFCTR_CHAIN;
+}
+
+/**
+ * kvm_pmu_get_pair_counter_value - get PMU counter value
+ * @vcpu: The vcpu pointer
+ * @pmc: The PMU counter pointer
+ */
+static u64 kvm_pmu_get_pair_counter_value(struct kvm_vcpu *vcpu,
+                                         struct kvm_pmc *pmc)
+{
+       u64 counter, counter_high, reg, enabled, running;
+
+       if (kvm_pmu_pmc_is_chained(pmc)) {
+               pmc = kvm_pmu_get_canonical_pmc(pmc);
+               reg = PMEVCNTR0_EL0 + pmc->idx;
+
+               counter = __vcpu_sys_reg(vcpu, reg);
+               counter_high = __vcpu_sys_reg(vcpu, reg + 1);
+
+               counter = lower_32_bits(counter) | (counter_high << 32);
+       } else {
+               reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX)
+                     ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + pmc->idx;
+               counter = __vcpu_sys_reg(vcpu, reg);
+       }
+
+       /*
+        * The real counter value is equal to the value of the counter register
+        * plus the value the perf event has counted.
+        */
+       if (pmc->perf_event)
+               counter += perf_event_read_value(pmc->perf_event, &enabled,
+                                                &running);
+
+       return counter;
+}
+
+/**
+ * kvm_pmu_get_counter_value - get PMU counter value
+ * @vcpu: The vcpu pointer
+ * @select_idx: The counter index
+ */
+u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx)
+{
+       u64 counter;
+       struct kvm_pmu *pmu = &vcpu->arch.pmu;
+       struct kvm_pmc *pmc = &pmu->pmc[select_idx];
+
+       counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
+
+       if (kvm_pmu_pmc_is_chained(pmc) &&
+           kvm_pmu_idx_is_high_counter(select_idx))
+               counter = upper_32_bits(counter);
+       else if (select_idx != ARMV8_PMU_CYCLE_IDX)
+               counter = lower_32_bits(counter);
+
+       return counter;
+}
+
+/**
+ * kvm_pmu_set_counter_value - set PMU counter value
+ * @vcpu: The vcpu pointer
+ * @select_idx: The counter index
+ * @val: The counter value
+ */
+void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val)
+{
+       u64 reg;
+
+       reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
+             ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx;
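+       /*
+        * The sysreg only holds the delta to what the perf event has counted,
+        * so adjust it such that the summed value reads back as @val.
+        */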
+       __vcpu_sys_reg(vcpu, reg) += (s64)val - kvm_pmu_get_counter_value(vcpu, select_idx);
+
+       /* Recreate the perf event to reflect the updated sample_period */
+       kvm_pmu_create_perf_event(vcpu, select_idx);
+}
+
+/**
+ * kvm_pmu_release_perf_event - remove the perf event
+ * @pmc: The PMU counter pointer
+ */
+static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc)
+{
+       pmc = kvm_pmu_get_canonical_pmc(pmc);
+       if (pmc->perf_event) {
+               perf_event_disable(pmc->perf_event);
+               perf_event_release_kernel(pmc->perf_event);
+               pmc->perf_event = NULL;
+       }
+}
+
+/**
+ * kvm_pmu_stop_counter - stop PMU counter
+ * @pmc: The PMU counter pointer
+ *
+ * If this counter has been configured to monitor some event, release it here.
+ */
+static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc)
+{
+       u64 counter, reg, val;
+
+       pmc = kvm_pmu_get_canonical_pmc(pmc);
+       if (!pmc->perf_event)
+               return;
+
+       counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
+
+       if (pmc->idx == ARMV8_PMU_CYCLE_IDX) {
+               reg = PMCCNTR_EL0;
+               val = counter;
+       } else {
+               reg = PMEVCNTR0_EL0 + pmc->idx;
+               val = lower_32_bits(counter);
+       }
+
+       __vcpu_sys_reg(vcpu, reg) = val;
+
+       if (kvm_pmu_pmc_is_chained(pmc))
+               __vcpu_sys_reg(vcpu, reg + 1) = upper_32_bits(counter);
+
+       kvm_pmu_release_perf_event(pmc);
+}
+
+/**
+ * kvm_pmu_vcpu_init - assign pmu counter idx for cpu
+ * @vcpu: The vcpu pointer
+ *
+ */
+void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu)
+{
+       int i;
+       struct kvm_pmu *pmu = &vcpu->arch.pmu;
+
+       for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++)
+               pmu->pmc[i].idx = i;
+}
+
+/**
+ * kvm_pmu_vcpu_reset - reset pmu state for cpu
+ * @vcpu: The vcpu pointer
+ *
+ */
+void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+       unsigned long mask = kvm_pmu_valid_counter_mask(vcpu);
+       struct kvm_pmu *pmu = &vcpu->arch.pmu;
+       int i;
+
+       for_each_set_bit(i, &mask, 32)
+               kvm_pmu_stop_counter(vcpu, &pmu->pmc[i]);
+
+       bitmap_zero(vcpu->arch.pmu.chained, ARMV8_PMU_MAX_COUNTER_PAIRS);
+}
+
+/**
+ * kvm_pmu_vcpu_destroy - free perf event of PMU for cpu
+ * @vcpu: The vcpu pointer
+ *
+ */
+void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+       int i;
+       struct kvm_pmu *pmu = &vcpu->arch.pmu;
+
+       for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++)
+               kvm_pmu_release_perf_event(&pmu->pmc[i]);
+}
+
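+/*
+ * PMCR_EL0.N is the number of implemented event counters; the cycle counter
+ * is always valid on top of those.
+ */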
+u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu)
+{
+       u64 val = __vcpu_sys_reg(vcpu, PMCR_EL0) >> ARMV8_PMU_PMCR_N_SHIFT;
+
+       val &= ARMV8_PMU_PMCR_N_MASK;
+       if (val == 0)
+               return BIT(ARMV8_PMU_CYCLE_IDX);
+       else
+               return GENMASK(val - 1, 0) | BIT(ARMV8_PMU_CYCLE_IDX);
+}
+
+/**
+ * kvm_pmu_enable_counter_mask - enable selected PMU counters
+ * @vcpu: The vcpu pointer
+ * @val: the value guest writes to PMCNTENSET register
+ *
+ * Call perf_event_enable to start counting the perf event
+ */
+void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val)
+{
+       int i;
+       struct kvm_pmu *pmu = &vcpu->arch.pmu;
+       struct kvm_pmc *pmc;
+
+       if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) || !val)
+               return;
+
+       for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) {
+               if (!(val & BIT(i)))
+                       continue;
+
+               pmc = &pmu->pmc[i];
+
+               /* A change in the enable state may affect the chain state */
+               kvm_pmu_update_pmc_chained(vcpu, i);
+               kvm_pmu_create_perf_event(vcpu, i);
+
+               /* At this point, pmc must be the canonical */
+               if (pmc->perf_event) {
+                       perf_event_enable(pmc->perf_event);
+                       if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE)
+                               kvm_debug("fail to enable perf event\n");
+               }
+       }
+}
+
+/**
+ * kvm_pmu_disable_counter_mask - disable selected PMU counters
+ * @vcpu: The vcpu pointer
+ * @val: the value guest writes to PMCNTENCLR register
+ *
+ * Call perf_event_disable to stop counting the perf event
+ */
+void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val)
+{
+       int i;
+       struct kvm_pmu *pmu = &vcpu->arch.pmu;
+       struct kvm_pmc *pmc;
+
+       if (!val)
+               return;
+
+       for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) {
+               if (!(val & BIT(i)))
+                       continue;
+
+               pmc = &pmu->pmc[i];
+
+               /* A change in the enable state may affect the chain state */
+               kvm_pmu_update_pmc_chained(vcpu, i);
+               kvm_pmu_create_perf_event(vcpu, i);
+
+               /* At this point, pmc must be the canonical */
+               if (pmc->perf_event)
+                       perf_event_disable(pmc->perf_event);
+       }
+}
+
+static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu)
+{
+       u64 reg = 0;
+
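+       /*
+        * Only counters that are enabled and have their overflow interrupt
+        * unmasked contribute to the overall interrupt state.
+        */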
+       if ((__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) {
+               reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
+               reg &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
+               reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1);
+               reg &= kvm_pmu_valid_counter_mask(vcpu);
+       }
+
+       return reg;
+}
+
+static void kvm_pmu_update_state(struct kvm_vcpu *vcpu)
+{
+       struct kvm_pmu *pmu = &vcpu->arch.pmu;
+       bool overflow;
+
+       if (!kvm_arm_pmu_v3_ready(vcpu))
+               return;
+
+       overflow = !!kvm_pmu_overflow_status(vcpu);
+       if (pmu->irq_level == overflow)
+               return;
+
+       pmu->irq_level = overflow;
+
+       if (likely(irqchip_in_kernel(vcpu->kvm))) {
+               int ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
+                                             pmu->irq_num, overflow, pmu);
+               WARN_ON(ret);
+       }
+}
+
+bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu)
+{
+       struct kvm_pmu *pmu = &vcpu->arch.pmu;
+       struct kvm_sync_regs *sregs = &vcpu->run->s.regs;
+       bool run_level = sregs->device_irq_level & KVM_ARM_DEV_PMU;
+
+       if (likely(irqchip_in_kernel(vcpu->kvm)))
+               return false;
+
+       return pmu->irq_level != run_level;
+}
+
+/*
+ * Reflect the PMU overflow interrupt output level into the kvm_run structure
+ */
+void kvm_pmu_update_run(struct kvm_vcpu *vcpu)
+{
+       struct kvm_sync_regs *regs = &vcpu->run->s.regs;
+
+       /* Populate the PMU bit of the device IRQ bitmap for user space */
+       regs->device_irq_level &= ~KVM_ARM_DEV_PMU;
+       if (vcpu->arch.pmu.irq_level)
+               regs->device_irq_level |= KVM_ARM_DEV_PMU;
+}
+
+/**
+ * kvm_pmu_flush_hwstate - flush pmu state to cpu
+ * @vcpu: The vcpu pointer
+ *
+ * Check if the PMU has overflowed while we were running in the host, and inject
+ * an interrupt if that was the case.
+ */
+void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu)
+{
+       kvm_pmu_update_state(vcpu);
+}
+
+/**
+ * kvm_pmu_sync_hwstate - sync pmu state from cpu
+ * @vcpu: The vcpu pointer
+ *
+ * Check if the PMU has overflowed while we were running in the guest, and
+ * inject an interrupt if that was the case.
+ */
+void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu)
+{
+       kvm_pmu_update_state(vcpu);
+}
+
+/*
+ * When the perf event overflows, set the overflow status and inform the vcpu.
+ */
+static void kvm_pmu_perf_overflow(struct perf_event *perf_event,
+                                 struct perf_sample_data *data,
+                                 struct pt_regs *regs)
+{
+       struct kvm_pmc *pmc = perf_event->overflow_handler_context;
+       struct arm_pmu *cpu_pmu = to_arm_pmu(perf_event->pmu);
+       struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
+       int idx = pmc->idx;
+       u64 period;
+
+       cpu_pmu->pmu.stop(perf_event, PERF_EF_UPDATE);
+
+       /*
+        * Reset the sample period to the architectural limit,
+        * i.e. the point where the counter overflows.
+        */
+       period = -(local64_read(&perf_event->count));
+
+       if (!kvm_pmu_idx_is_64bit(vcpu, pmc->idx))
+               period &= GENMASK(31, 0);
+
+       local64_set(&perf_event->hw.period_left, 0);
+       perf_event->attr.sample_period = period;
+       perf_event->hw.sample_period = period;
+
+       __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx);
+
+       if (kvm_pmu_overflow_status(vcpu)) {
+               kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+               kvm_vcpu_kick(vcpu);
+       }
+
+       cpu_pmu->pmu.start(perf_event, PERF_EF_RELOAD);
+}
+
+/**
+ * kvm_pmu_software_increment - do software increment
+ * @vcpu: The vcpu pointer
+ * @val: the value guest writes to PMSWINC register
+ */
+void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val)
+{
+       struct kvm_pmu *pmu = &vcpu->arch.pmu;
+       int i;
+
+       if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E))
+               return;
+
+       /* Weed out disabled counters */
+       val &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
+
+       for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++) {
+               u64 type, reg;
+
+               if (!(val & BIT(i)))
+                       continue;
+
+               /* PMSWINC only applies to ... SW_INC! */
+               type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i);
+               type &= ARMV8_PMU_EVTYPE_EVENT;
+               if (type != ARMV8_PMUV3_PERFCTR_SW_INCR)
+                       continue;
+
+               /* increment this SW_INC counter */
+               reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1;
+               reg = lower_32_bits(reg);
+               __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg;
+
+               if (reg) /* no overflow on the low part */
+                       continue;
+
+               if (kvm_pmu_pmc_is_chained(&pmu->pmc[i])) {
+                       /* increment the high counter */
+                       reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) + 1;
+                       reg = lower_32_bits(reg);
+                       __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) = reg;
+                       if (!reg) /* mark overflow on the high counter */
+                               __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i + 1);
+               } else {
+                       /* mark overflow on low counter */
+                       __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i);
+               }
+       }
+}
+
+/**
+ * kvm_pmu_handle_pmcr - handle PMCR register
+ * @vcpu: The vcpu pointer
+ * @val: the value guest writes to PMCR register
+ */
+void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
+{
+       unsigned long mask = kvm_pmu_valid_counter_mask(vcpu);
+       int i;
+
+       if (val & ARMV8_PMU_PMCR_E) {
+               kvm_pmu_enable_counter_mask(vcpu,
+                      __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask);
+       } else {
+               kvm_pmu_disable_counter_mask(vcpu, mask);
+       }
+
+       if (val & ARMV8_PMU_PMCR_C)
+               kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0);
+
+       if (val & ARMV8_PMU_PMCR_P) {
+               for_each_set_bit(i, &mask, 32)
+                       kvm_pmu_set_counter_value(vcpu, i, 0);
+       }
+}
+
+static bool kvm_pmu_counter_is_enabled(struct kvm_vcpu *vcpu, u64 select_idx)
+{
+       return (__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) &&
+              (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(select_idx));
+}
+
+/**
+ * kvm_pmu_create_perf_event - create a perf event for a counter
+ * @vcpu: The vcpu pointer
+ * @select_idx: The number of selected counter
+ */
+static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
+{
+       struct kvm_pmu *pmu = &vcpu->arch.pmu;
+       struct kvm_pmc *pmc;
+       struct perf_event *event;
+       struct perf_event_attr attr;
+       u64 eventsel, counter, reg, data;
+
+       /*
+        * For chained counters the event type and filtering attributes are
+        * obtained from the low/even counter. We also use this counter to
+        * determine if the event is enabled/disabled.
+        */
+       pmc = kvm_pmu_get_canonical_pmc(&pmu->pmc[select_idx]);
+
+       reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX)
+             ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + pmc->idx;
+       data = __vcpu_sys_reg(vcpu, reg);
+
+       kvm_pmu_stop_counter(vcpu, pmc);
+       eventsel = data & ARMV8_PMU_EVTYPE_EVENT;
+
+       /* Software increment event doesn't need to be backed by a perf event */
+       if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR &&
+           pmc->idx != ARMV8_PMU_CYCLE_IDX)
+               return;
+
+       memset(&attr, 0, sizeof(struct perf_event_attr));
+       attr.type = PERF_TYPE_RAW;
+       attr.size = sizeof(attr);
+       attr.pinned = 1;
+       attr.disabled = !kvm_pmu_counter_is_enabled(vcpu, pmc->idx);
+       attr.exclude_user = data & ARMV8_PMU_EXCLUDE_EL0 ? 1 : 0;
+       attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0;
+       attr.exclude_hv = 1; /* Don't count EL2 events */
+       attr.exclude_host = 1; /* Don't count host events */
+       attr.config = (pmc->idx == ARMV8_PMU_CYCLE_IDX) ?
+               ARMV8_PMUV3_PERFCTR_CPU_CYCLES : eventsel;
+
+       counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
+
+       if (kvm_pmu_pmc_is_chained(pmc)) {
+               /*
+                * The initial sample period (overflow count) of an event. For
+                * chained counters we only support overflow interrupts on the
+                * high counter.
+                */
+               attr.sample_period = (-counter) & GENMASK(63, 0);
+               attr.config1 |= PERF_ATTR_CFG1_KVM_PMU_CHAINED;
+
+               event = perf_event_create_kernel_counter(&attr, -1, current,
+                                                        kvm_pmu_perf_overflow,
+                                                        pmc + 1);
+       } else {
+               /* The initial sample period (overflow count) of an event. */
+               if (kvm_pmu_idx_is_64bit(vcpu, pmc->idx))
+                       attr.sample_period = (-counter) & GENMASK(63, 0);
+               else
+                       attr.sample_period = (-counter) & GENMASK(31, 0);
+
+               event = perf_event_create_kernel_counter(&attr, -1, current,
+                                                kvm_pmu_perf_overflow, pmc);
+       }
+
+       if (IS_ERR(event)) {
+               pr_err_once("kvm: pmu event creation failed %ld\n",
+                           PTR_ERR(event));
+               return;
+       }
+
+       pmc->perf_event = event;
+}
+
+/**
+ * kvm_pmu_update_pmc_chained - update chained bitmap
+ * @vcpu: The vcpu pointer
+ * @select_idx: The number of selected counter
+ *
+ * Update the chained bitmap based on the event type written in the
+ * typer register and the enable state of the odd register.
+ */
+static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx)
+{
+       struct kvm_pmu *pmu = &vcpu->arch.pmu;
+       struct kvm_pmc *pmc = &pmu->pmc[select_idx], *canonical_pmc;
+       bool new_state, old_state;
+
+       old_state = kvm_pmu_pmc_is_chained(pmc);
+       new_state = kvm_pmu_idx_has_chain_evtype(vcpu, pmc->idx) &&
+                   kvm_pmu_counter_is_enabled(vcpu, pmc->idx | 0x1);
+
+       if (old_state == new_state)
+               return;
+
+       canonical_pmc = kvm_pmu_get_canonical_pmc(pmc);
+       kvm_pmu_stop_counter(vcpu, canonical_pmc);
+       if (new_state) {
+               /*
+                * During promotion from !chained to chained we must ensure
+                * the adjacent counter is stopped and its event destroyed
+                */
+               kvm_pmu_stop_counter(vcpu, kvm_pmu_get_alternate_pmc(pmc));
+               set_bit(pmc->idx >> 1, vcpu->arch.pmu.chained);
+               return;
+       }
+       clear_bit(pmc->idx >> 1, vcpu->arch.pmu.chained);
+}
+
+/**
+ * kvm_pmu_set_counter_event_type - set selected counter to monitor some event
+ * @vcpu: The vcpu pointer
+ * @data: The data guest writes to PMXEVTYPER_EL0
+ * @select_idx: The number of selected counter
+ *
+ * When the OS accesses PMXEVTYPER_EL0, it wants to set a PMC to count an
+ * event with the given hardware event number. Here we call the perf_event API to
+ * emulate this action and create a kernel perf event for it.
+ */
+void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
+                                   u64 select_idx)
+{
+       u64 reg, event_type = data & ARMV8_PMU_EVTYPE_MASK;
+
+       reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
+             ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + select_idx;
+
+       __vcpu_sys_reg(vcpu, reg) = event_type;
+
+       kvm_pmu_update_pmc_chained(vcpu, select_idx);
+       kvm_pmu_create_perf_event(vcpu, select_idx);
+}
+
+bool kvm_arm_support_pmu_v3(void)
+{
+       /*
+        * Check if HW_PERF_EVENTS is supported by checking the number of
+        * hardware performance counters. This ensures that a physical PMU
+        * is present and that CONFIG_PERF_EVENTS is selected.
+        */
+       return (perf_num_counters() > 0);
+}
+
+int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
+{
+       if (!vcpu->arch.pmu.created)
+               return 0;
+
+       /*
+        * A valid interrupt configuration for the PMU is either to have a
+        * properly configured interrupt number and using an in-kernel
+        * irqchip, or to not have an in-kernel GIC and not set an IRQ.
+        */
+       if (irqchip_in_kernel(vcpu->kvm)) {
+               int irq = vcpu->arch.pmu.irq_num;
+               if (!kvm_arm_pmu_irq_initialized(vcpu))
+                       return -EINVAL;
+
+               /*
+                * If we are using an in-kernel vgic, at this point we know
+                * the vgic will be initialized, so we can check the PMU irq
+                * number against the dimensions of the vgic and make sure
+                * it's valid.
+                */
+               if (!irq_is_ppi(irq) && !vgic_valid_spi(vcpu->kvm, irq))
+                       return -EINVAL;
+       } else if (kvm_arm_pmu_irq_initialized(vcpu)) {
+               return -EINVAL;
+       }
+
+       kvm_pmu_vcpu_reset(vcpu);
+       vcpu->arch.pmu.ready = true;
+
+       return 0;
+}
+
+static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
+{
+       if (!kvm_arm_support_pmu_v3())
+               return -ENODEV;
+
+       if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
+               return -ENXIO;
+
+       if (vcpu->arch.pmu.created)
+               return -EBUSY;
+
+       if (irqchip_in_kernel(vcpu->kvm)) {
+               int ret;
+
+               /*
+                * If using the PMU with an in-kernel virtual GIC
+                * implementation, we require the GIC to be already
+                * initialized when initializing the PMU.
+                */
+               if (!vgic_initialized(vcpu->kvm))
+                       return -ENODEV;
+
+               if (!kvm_arm_pmu_irq_initialized(vcpu))
+                       return -ENXIO;
+
+               ret = kvm_vgic_set_owner(vcpu, vcpu->arch.pmu.irq_num,
+                                        &vcpu->arch.pmu);
+               if (ret)
+                       return ret;
+       }
+
+       vcpu->arch.pmu.created = true;
+       return 0;
+}
+
+/*
+ * For a given VM, the interrupt type must be the same for each vcpu.
+ * As a PPI, the interrupt number is the same for all vcpus,
+ * while as an SPI it must be a separate number per vcpu.
+ */
+static bool pmu_irq_is_valid(struct kvm *kvm, int irq)
+{
+       int i;
+       struct kvm_vcpu *vcpu;
+
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               if (!kvm_arm_pmu_irq_initialized(vcpu))
+                       continue;
+
+               if (irq_is_ppi(irq)) {
+                       if (vcpu->arch.pmu.irq_num != irq)
+                               return false;
+               } else {
+                       if (vcpu->arch.pmu.irq_num == irq)
+                               return false;
+               }
+       }
+
+       return true;
+}
+
+int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+{
+       switch (attr->attr) {
+       case KVM_ARM_VCPU_PMU_V3_IRQ: {
+               int __user *uaddr = (int __user *)(long)attr->addr;
+               int irq;
+
+               if (!irqchip_in_kernel(vcpu->kvm))
+                       return -EINVAL;
+
+               if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
+                       return -ENODEV;
+
+               if (get_user(irq, uaddr))
+                       return -EFAULT;
+
+               /* The PMU overflow interrupt can be a PPI or a valid SPI. */
+               if (!(irq_is_ppi(irq) || irq_is_spi(irq)))
+                       return -EINVAL;
+
+               if (!pmu_irq_is_valid(vcpu->kvm, irq))
+                       return -EINVAL;
+
+               if (kvm_arm_pmu_irq_initialized(vcpu))
+                       return -EBUSY;
+
+               kvm_debug("Set kvm ARM PMU irq: %d\n", irq);
+               vcpu->arch.pmu.irq_num = irq;
+               return 0;
+       }
+       case KVM_ARM_VCPU_PMU_V3_INIT:
+               return kvm_arm_pmu_v3_init(vcpu);
+       }
+
+       return -ENXIO;
+}
+
+int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+{
+       switch (attr->attr) {
+       case KVM_ARM_VCPU_PMU_V3_IRQ: {
+               int __user *uaddr = (int __user *)(long)attr->addr;
+               int irq;
+
+               if (!irqchip_in_kernel(vcpu->kvm))
+                       return -EINVAL;
+
+               if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
+                       return -ENODEV;
+
+               if (!kvm_arm_pmu_irq_initialized(vcpu))
+                       return -ENXIO;
+
+               irq = vcpu->arch.pmu.irq_num;
+               return put_user(irq, uaddr);
+       }
+       }
+
+       return -ENXIO;
+}
+
+int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+{
+       switch (attr->attr) {
+       case KVM_ARM_VCPU_PMU_V3_IRQ:
+       case KVM_ARM_VCPU_PMU_V3_INIT:
+               if (kvm_arm_support_pmu_v3() &&
+                   test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
+                       return 0;
+       }
+
+       return -ENXIO;
+}
diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c
new file mode 100644 (file)
index 0000000..ae36471
--- /dev/null
@@ -0,0 +1,564 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+
+#include <linux/arm-smccc.h>
+#include <linux/preempt.h>
+#include <linux/kvm_host.h>
+#include <linux/uaccess.h>
+#include <linux/wait.h>
+
+#include <asm/cputype.h>
+#include <asm/kvm_emulate.h>
+
+#include <kvm/arm_psci.h>
+#include <kvm/arm_hypercalls.h>
+
+/*
+ * This is an implementation of the Power State Coordination Interface
+ * as described in ARM document number ARM DEN 0022A.
+ */
+
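+/* Mask retaining the affinity fields at and above the given level */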
+#define AFFINITY_MASK(level)   ~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1)
+
+static unsigned long psci_affinity_mask(unsigned long affinity_level)
+{
+       if (affinity_level <= 3)
+               return MPIDR_HWID_BITMASK & AFFINITY_MASK(affinity_level);
+
+       return 0;
+}
+
+static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu)
+{
+       /*
+        * NOTE: For simplicity, we treat VCPU suspend emulation the same
+        * as WFI (Wait-for-interrupt) emulation.
+        *
+        * This means for KVM the wakeup events are interrupts and
+        * this is consistent with intended use of StateID as described
+        * in section 5.4.1 of PSCI v0.2 specification (ARM DEN 0022A).
+        *
+        * Further, we also treat a power-down request the same as a
+        * stand-by request, as per section 5.4.2 clause 3 of the PSCI v0.2
+        * specification (ARM DEN 0022A). This means all suspend states
+        * for KVM will preserve the register state.
+        */
+       kvm_vcpu_block(vcpu);
+       kvm_clear_request(KVM_REQ_UNHALT, vcpu);
+
+       return PSCI_RET_SUCCESS;
+}
+
+static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.power_off = true;
+       kvm_make_request(KVM_REQ_SLEEP, vcpu);
+       kvm_vcpu_kick(vcpu);
+}
+
+static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
+{
+       struct vcpu_reset_state *reset_state;
+       struct kvm *kvm = source_vcpu->kvm;
+       struct kvm_vcpu *vcpu = NULL;
+       unsigned long cpu_id;
+
+       cpu_id = smccc_get_arg1(source_vcpu) & MPIDR_HWID_BITMASK;
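+       /* A 32bit caller can only specify the lower 32 bits of the target MPIDR */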
+       if (vcpu_mode_is_32bit(source_vcpu))
+               cpu_id &= ~((u32) 0);
+
+       vcpu = kvm_mpidr_to_vcpu(kvm, cpu_id);
+
+       /*
+        * Make sure the caller requested a valid CPU and that the CPU is
+        * turned off.
+        */
+       if (!vcpu)
+               return PSCI_RET_INVALID_PARAMS;
+       if (!vcpu->arch.power_off) {
+               if (kvm_psci_version(source_vcpu, kvm) != KVM_ARM_PSCI_0_1)
+                       return PSCI_RET_ALREADY_ON;
+               else
+                       return PSCI_RET_INVALID_PARAMS;
+       }
+
+       reset_state = &vcpu->arch.reset_state;
+
+       reset_state->pc = smccc_get_arg2(source_vcpu);
+
+       /* Propagate caller endianness */
+       reset_state->be = kvm_vcpu_is_be(source_vcpu);
+
+       /*
+        * NOTE: We always update r0 (or x0) because for PSCI v0.1
+        * the general purpose registers are undefined upon CPU_ON.
+        */
+       reset_state->r0 = smccc_get_arg3(source_vcpu);
+
+       WRITE_ONCE(reset_state->reset, true);
+       kvm_make_request(KVM_REQ_VCPU_RESET, vcpu);
+
+       /*
+        * Make sure the reset request is observed if the change to
+        * power_state is observed.
+        */
+       smp_wmb();
+
+       vcpu->arch.power_off = false;
+       kvm_vcpu_wake_up(vcpu);
+
+       return PSCI_RET_SUCCESS;
+}
+
+static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
+{
+       int i, matching_cpus = 0;
+       unsigned long mpidr;
+       unsigned long target_affinity;
+       unsigned long target_affinity_mask;
+       unsigned long lowest_affinity_level;
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_vcpu *tmp;
+
+       target_affinity = smccc_get_arg1(vcpu);
+       lowest_affinity_level = smccc_get_arg2(vcpu);
+
+       /* Determine target affinity mask */
+       target_affinity_mask = psci_affinity_mask(lowest_affinity_level);
+       if (!target_affinity_mask)
+               return PSCI_RET_INVALID_PARAMS;
+
+       /* Ignore other bits of target affinity */
+       target_affinity &= target_affinity_mask;
+
+       /*
+        * If one or more VCPUs matching the target affinity are running,
+        * report ON, otherwise report OFF.
+        */
+       kvm_for_each_vcpu(i, tmp, kvm) {
+               mpidr = kvm_vcpu_get_mpidr_aff(tmp);
+               if ((mpidr & target_affinity_mask) == target_affinity) {
+                       matching_cpus++;
+                       if (!tmp->arch.power_off)
+                               return PSCI_0_2_AFFINITY_LEVEL_ON;
+               }
+       }
+
+       if (!matching_cpus)
+               return PSCI_RET_INVALID_PARAMS;
+
+       return PSCI_0_2_AFFINITY_LEVEL_OFF;
+}
+
+static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type)
+{
+       int i;
+       struct kvm_vcpu *tmp;
+
+       /*
+        * The KVM ABI specifies that after a system event exit, userspace may
+        * call KVM_RUN again and perform the shutdown/reboot at a later time
+        * than when the actual request was made.  Since we are implementing
+        * PSCI, and a caller of PSCI reboot or shutdown expects the system to
+        * shut down or reboot immediately, let's make sure that VCPUs are not run
+        * after this call is handled and before the VCPUs have been
+        * re-initialized.
+        */
+       kvm_for_each_vcpu(i, tmp, vcpu->kvm)
+               tmp->arch.power_off = true;
+       kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP);
+
+       memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
+       vcpu->run->system_event.type = type;
+       vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+}
+
+static void kvm_psci_system_off(struct kvm_vcpu *vcpu)
+{
+       kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN);
+}
+
+static void kvm_psci_system_reset(struct kvm_vcpu *vcpu)
+{
+       kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET);
+}
+
+static void kvm_psci_narrow_to_32bit(struct kvm_vcpu *vcpu)
+{
+       int i;
+
+       /*
+        * Zero the input registers' upper 32 bits. They will be fully
+        * zeroed on exit, so we're fine changing them in place.
+        */
+       for (i = 1; i < 4; i++)
+               vcpu_set_reg(vcpu, i, lower_32_bits(vcpu_get_reg(vcpu, i)));
+}
+
+static unsigned long kvm_psci_check_allowed_function(struct kvm_vcpu *vcpu, u32 fn)
+{
+       switch (fn) {
+       case PSCI_0_2_FN64_CPU_SUSPEND:
+       case PSCI_0_2_FN64_CPU_ON:
+       case PSCI_0_2_FN64_AFFINITY_INFO:
+               /* Disallow these functions for 32bit guests */
+               if (vcpu_mode_is_32bit(vcpu))
+                       return PSCI_RET_NOT_SUPPORTED;
+               break;
+       }
+
+       return 0;
+}
+
+static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
+{
+       struct kvm *kvm = vcpu->kvm;
+       u32 psci_fn = smccc_get_function(vcpu);
+       unsigned long val;
+       int ret = 1;
+
+       val = kvm_psci_check_allowed_function(vcpu, psci_fn);
+       if (val)
+               goto out;
+
+       switch (psci_fn) {
+       case PSCI_0_2_FN_PSCI_VERSION:
+               /*
+                * Bits[31:16] = Major Version = 0
+                * Bits[15:0] = Minor Version = 2
+                */
+               val = KVM_ARM_PSCI_0_2;
+               break;
+       case PSCI_0_2_FN_CPU_SUSPEND:
+       case PSCI_0_2_FN64_CPU_SUSPEND:
+               val = kvm_psci_vcpu_suspend(vcpu);
+               break;
+       case PSCI_0_2_FN_CPU_OFF:
+               kvm_psci_vcpu_off(vcpu);
+               val = PSCI_RET_SUCCESS;
+               break;
+       case PSCI_0_2_FN_CPU_ON:
+               kvm_psci_narrow_to_32bit(vcpu);
+               fallthrough;
+       case PSCI_0_2_FN64_CPU_ON:
+               mutex_lock(&kvm->lock);
+               val = kvm_psci_vcpu_on(vcpu);
+               mutex_unlock(&kvm->lock);
+               break;
+       case PSCI_0_2_FN_AFFINITY_INFO:
+               kvm_psci_narrow_to_32bit(vcpu);
+               fallthrough;
+       case PSCI_0_2_FN64_AFFINITY_INFO:
+               val = kvm_psci_vcpu_affinity_info(vcpu);
+               break;
+       case PSCI_0_2_FN_MIGRATE_INFO_TYPE:
+               /*
+                * Either the Trusted OS is MP (and hence does not require
+                * migration), or the Trusted OS is not present.
+                */
+               val = PSCI_0_2_TOS_MP;
+               break;
+       case PSCI_0_2_FN_SYSTEM_OFF:
+               kvm_psci_system_off(vcpu);
+               /*
+                * We shouldn't be going back to the guest VCPU after
+                * receiving a SYSTEM_OFF request.
+                *
+                * If user space accidentally or deliberately resumes the
+                * guest VCPU after a SYSTEM_OFF request, the guest VCPU
+                * should see an internal failure from the PSCI return
+                * value. To achieve this, we preload r0 (or x0) with
+                * PSCI return value INTERNAL_FAILURE.
+                */
+               val = PSCI_RET_INTERNAL_FAILURE;
+               ret = 0;
+               break;
+       case PSCI_0_2_FN_SYSTEM_RESET:
+               kvm_psci_system_reset(vcpu);
+               /*
+                * Same reason as SYSTEM_OFF for preloading r0 (or x0)
+                * with PSCI return value INTERNAL_FAILURE.
+                */
+               val = PSCI_RET_INTERNAL_FAILURE;
+               ret = 0;
+               break;
+       default:
+               val = PSCI_RET_NOT_SUPPORTED;
+               break;
+       }
+
+out:
+       smccc_set_retval(vcpu, val, 0, 0, 0);
+       return ret;
+}
+
+static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu)
+{
+       u32 psci_fn = smccc_get_function(vcpu);
+       u32 feature;
+       unsigned long val;
+       int ret = 1;
+
+       switch (psci_fn) {
+       case PSCI_0_2_FN_PSCI_VERSION:
+               val = KVM_ARM_PSCI_1_0;
+               break;
+       case PSCI_1_0_FN_PSCI_FEATURES:
+               feature = smccc_get_arg1(vcpu);
+               val = kvm_psci_check_allowed_function(vcpu, feature);
+               if (val)
+                       break;
+
+               switch (feature) {
+               case PSCI_0_2_FN_PSCI_VERSION:
+               case PSCI_0_2_FN_CPU_SUSPEND:
+               case PSCI_0_2_FN64_CPU_SUSPEND:
+               case PSCI_0_2_FN_CPU_OFF:
+               case PSCI_0_2_FN_CPU_ON:
+               case PSCI_0_2_FN64_CPU_ON:
+               case PSCI_0_2_FN_AFFINITY_INFO:
+               case PSCI_0_2_FN64_AFFINITY_INFO:
+               case PSCI_0_2_FN_MIGRATE_INFO_TYPE:
+               case PSCI_0_2_FN_SYSTEM_OFF:
+               case PSCI_0_2_FN_SYSTEM_RESET:
+               case PSCI_1_0_FN_PSCI_FEATURES:
+               case ARM_SMCCC_VERSION_FUNC_ID:
+                       val = 0;
+                       break;
+               default:
+                       val = PSCI_RET_NOT_SUPPORTED;
+                       break;
+               }
+               break;
+       default:
+               return kvm_psci_0_2_call(vcpu);
+       }
+
+       smccc_set_retval(vcpu, val, 0, 0, 0);
+       return ret;
+}
+
+static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
+{
+       struct kvm *kvm = vcpu->kvm;
+       u32 psci_fn = smccc_get_function(vcpu);
+       unsigned long val;
+
+       switch (psci_fn) {
+       case KVM_PSCI_FN_CPU_OFF:
+               kvm_psci_vcpu_off(vcpu);
+               val = PSCI_RET_SUCCESS;
+               break;
+       case KVM_PSCI_FN_CPU_ON:
+               mutex_lock(&kvm->lock);
+               val = kvm_psci_vcpu_on(vcpu);
+               mutex_unlock(&kvm->lock);
+               break;
+       default:
+               val = PSCI_RET_NOT_SUPPORTED;
+               break;
+       }
+
+       smccc_set_retval(vcpu, val, 0, 0, 0);
+       return 1;
+}
+
+/**
+ * kvm_psci_call - handle PSCI call if r0 value is in range
+ * @vcpu: Pointer to the VCPU struct
+ *
+ * Handle PSCI calls from guests through traps from HVC instructions.
+ * The calling convention is similar to SMC calls to the secure world
+ * where the function number is placed in r0.
+ *
+ * This function returns: > 0 (success), 0 (success but exit to user
+ * space), and < 0 (errors)
+ *
+ * Errors:
+ * -EINVAL: Unrecognized PSCI function
+ */
+int kvm_psci_call(struct kvm_vcpu *vcpu)
+{
+       switch (kvm_psci_version(vcpu, vcpu->kvm)) {
+       case KVM_ARM_PSCI_1_0:
+               return kvm_psci_1_0_call(vcpu);
+       case KVM_ARM_PSCI_0_2:
+               return kvm_psci_0_2_call(vcpu);
+       case KVM_ARM_PSCI_0_1:
+               return kvm_psci_0_1_call(vcpu);
+       default:
+               return -EINVAL;
+       }
+}
+
+int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu)
+{
+       return 3;               /* PSCI version and two workaround registers */
+}
+
+int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
+{
+       if (put_user(KVM_REG_ARM_PSCI_VERSION, uindices++))
+               return -EFAULT;
+
+       if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1, uindices++))
+               return -EFAULT;
+
+       if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2, uindices++))
+               return -EFAULT;
+
+       return 0;
+}
+
+#define KVM_REG_FEATURE_LEVEL_WIDTH    4
+#define KVM_REG_FEATURE_LEVEL_MASK     (BIT(KVM_REG_FEATURE_LEVEL_WIDTH) - 1)
+
+/*
+ * Convert the workaround level into an easy-to-compare number, where higher
+ * values mean better protection.
+ */
+static int get_kernel_wa_level(u64 regid)
+{
+       switch (regid) {
+       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
+               switch (kvm_arm_harden_branch_predictor()) {
+               case KVM_BP_HARDEN_UNKNOWN:
+                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL;
+               case KVM_BP_HARDEN_WA_NEEDED:
+                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL;
+               case KVM_BP_HARDEN_NOT_REQUIRED:
+                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED;
+               }
+               return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL;
+       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
+               switch (kvm_arm_have_ssbd()) {
+               case KVM_SSBD_FORCE_DISABLE:
+                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
+               case KVM_SSBD_KERNEL:
+                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL;
+               case KVM_SSBD_FORCE_ENABLE:
+               case KVM_SSBD_MITIGATED:
+                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED;
+               case KVM_SSBD_UNKNOWN:
+               default:
+                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN;
+               }
+       }
+
+       return -EINVAL;
+}
+
+int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
+{
+       void __user *uaddr = (void __user *)(long)reg->addr;
+       u64 val;
+
+       switch (reg->id) {
+       case KVM_REG_ARM_PSCI_VERSION:
+               val = kvm_psci_version(vcpu, vcpu->kvm);
+               break;
+       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
+               val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK;
+               break;
+       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
+               val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK;
+
+               if (val == KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL &&
+                   kvm_arm_get_vcpu_workaround_2_flag(vcpu))
+                       val |= KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED;
+               break;
+       default:
+               return -ENOENT;
+       }
+
+       if (copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)))
+               return -EFAULT;
+
+       return 0;
+}
+
+int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
+{
+       void __user *uaddr = (void __user *)(long)reg->addr;
+       u64 val;
+       int wa_level;
+
+       if (copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id)))
+               return -EFAULT;
+
+       switch (reg->id) {
+       case KVM_REG_ARM_PSCI_VERSION:
+       {
+               bool wants_02;
+
+               wants_02 = test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features);
+
+               switch (val) {
+               case KVM_ARM_PSCI_0_1:
+                       if (wants_02)
+                               return -EINVAL;
+                       vcpu->kvm->arch.psci_version = val;
+                       return 0;
+               case KVM_ARM_PSCI_0_2:
+               case KVM_ARM_PSCI_1_0:
+                       if (!wants_02)
+                               return -EINVAL;
+                       vcpu->kvm->arch.psci_version = val;
+                       return 0;
+               }
+               break;
+       }
+
+       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
+               if (val & ~KVM_REG_FEATURE_LEVEL_MASK)
+                       return -EINVAL;
+
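+               /*
+                * Userspace cannot claim a higher mitigation level than the
+                * one the host actually provides.
+                */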
+               if (get_kernel_wa_level(reg->id) < val)
+                       return -EINVAL;
+
+               return 0;
+
+       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
+               if (val & ~(KVM_REG_FEATURE_LEVEL_MASK |
+                           KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED))
+                       return -EINVAL;
+
+               wa_level = val & KVM_REG_FEATURE_LEVEL_MASK;
+
+               if (get_kernel_wa_level(reg->id) < wa_level)
+                       return -EINVAL;
+
+               /* The enabled bit must not be set unless the level is AVAIL. */
+               if (wa_level != KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL &&
+                   wa_level != val)
+                       return -EINVAL;
+
+               /* Are we finished or do we need to check the enable bit? */
+               if (kvm_arm_have_ssbd() != KVM_SSBD_KERNEL)
+                       return 0;
+
+               /*
+                * If this kernel supports the workaround to be switched on
+                * or off, make sure it matches the requested setting.
+                */
+               switch (wa_level) {
+               case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL:
+                       kvm_arm_set_vcpu_workaround_2_flag(vcpu,
+                           val & KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED);
+                       break;
+               case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED:
+                       kvm_arm_set_vcpu_workaround_2_flag(vcpu, true);
+                       break;
+               }
+
+               return 0;
+       default:
+               return -ENOENT;
+       }
+
+       return -EINVAL;
+}
diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c
new file mode 100644 (file)
index 0000000..1e0f4c2
--- /dev/null
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2019 Arm Ltd.
+
+#include <linux/arm-smccc.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_mmu.h>
+#include <asm/pvclock-abi.h>
+
+#include <kvm/arm_hypercalls.h>
+
+void kvm_update_stolen_time(struct kvm_vcpu *vcpu)
+{
+       struct kvm *kvm = vcpu->kvm;
+       u64 steal;
+       __le64 steal_le;
+       u64 offset;
+       int idx;
+       u64 base = vcpu->arch.steal.base;
+
+       if (base == GPA_INVALID)
+               return;
+
+       /* Let's do the local bookkeeping */
+       steal = vcpu->arch.steal.steal;
+       steal += current->sched_info.run_delay - vcpu->arch.steal.last_steal;
+       vcpu->arch.steal.last_steal = current->sched_info.run_delay;
+       vcpu->arch.steal.steal = steal;
+
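+       /* Publish the updated value to the guest's shared stolen-time structure */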
+       steal_le = cpu_to_le64(steal);
+       idx = srcu_read_lock(&kvm->srcu);
+       offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time);
+       kvm_put_guest(kvm, base + offset, steal_le, u64);
+       srcu_read_unlock(&kvm->srcu, idx);
+}
+
+long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu)
+{
+       u32 feature = smccc_get_arg1(vcpu);
+       long val = SMCCC_RET_NOT_SUPPORTED;
+
+       switch (feature) {
+       case ARM_SMCCC_HV_PV_TIME_FEATURES:
+       case ARM_SMCCC_HV_PV_TIME_ST:
+               val = SMCCC_RET_SUCCESS;
+               break;
+       }
+
+       return val;
+}
+
+gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu)
+{
+       struct pvclock_vcpu_stolen_time init_values = {};
+       struct kvm *kvm = vcpu->kvm;
+       u64 base = vcpu->arch.steal.base;
+       int idx;
+
+       if (base == GPA_INVALID)
+               return base;
+
+       /*
+        * Start counting stolen time from the time the guest requests
+        * the feature to be enabled.
+        */
+       vcpu->arch.steal.steal = 0;
+       vcpu->arch.steal.last_steal = current->sched_info.run_delay;
+
+       idx = srcu_read_lock(&kvm->srcu);
+       kvm_write_guest(kvm, base, &init_values, sizeof(init_values));
+       srcu_read_unlock(&kvm->srcu, idx);
+
+       return base;
+}
+
+int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu,
+                           struct kvm_device_attr *attr)
+{
+       u64 __user *user = (u64 __user *)attr->addr;
+       struct kvm *kvm = vcpu->kvm;
+       u64 ipa;
+       int ret = 0;
+       int idx;
+
+       if (attr->attr != KVM_ARM_VCPU_PVTIME_IPA)
+               return -ENXIO;
+
+       if (get_user(ipa, user))
+               return -EFAULT;
+       if (!IS_ALIGNED(ipa, 64))
+               return -EINVAL;
+       if (vcpu->arch.steal.base != GPA_INVALID)
+               return -EEXIST;
+
+       /* Check the address is in a valid memslot */
+       idx = srcu_read_lock(&kvm->srcu);
+       if (kvm_is_error_hva(gfn_to_hva(kvm, ipa >> PAGE_SHIFT)))
+               ret = -EINVAL;
+       srcu_read_unlock(&kvm->srcu, idx);
+
+       if (!ret)
+               vcpu->arch.steal.base = ipa;
+
+       return ret;
+}
+
+int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu,
+                           struct kvm_device_attr *attr)
+{
+       u64 __user *user = (u64 __user *)attr->addr;
+       u64 ipa;
+
+       if (attr->attr != KVM_ARM_VCPU_PVTIME_IPA)
+               return -ENXIO;
+
+       ipa = vcpu->arch.steal.base;
+
+       if (put_user(ipa, user))
+               return -EFAULT;
+       return 0;
+}
+
+int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu,
+                           struct kvm_device_attr *attr)
+{
+       switch (attr->attr) {
+       case KVM_ARM_VCPU_PVTIME_IPA:
+               return 0;
+       }
+       return -ENXIO;
+}
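
kvm_arm_pvtime_set_attr()/get_attr() sit behind the per-vcpu device-attribute ioctls. A minimal sketch of the userspace side, assuming the KVM_ARM_VCPU_PVTIME_CTRL attribute group and a caller-supplied vcpu fd and IPA, might be:

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /*
     * Sketch only: point KVM at the guest-physical location of this vcpu's
     * stolen-time structure. 'vcpu_fd' and 'ipa' are placeholders; the IPA
     * must be 64-byte aligned and can only be set once
     * (kvm_arm_pvtime_set_attr() returns -EEXIST on a second attempt).
     */
    static int set_pvtime_ipa(int vcpu_fd, uint64_t ipa)
    {
            struct kvm_device_attr attr = {
                    .group = KVM_ARM_VCPU_PVTIME_CTRL,
                    .attr  = KVM_ARM_VCPU_PVTIME_IPA,
                    .addr  = (uint64_t)(unsigned long)&ipa,
            };

            return ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr);
    }
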
index eab91ad0effbe3784f23a4ae307773dca2ae65df..86f9ea47be297a476e7d38973c3711a029d1ecc6 100644 (file)
@@ -1,216 +1,8 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#if !defined(_TRACE_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#ifndef _TRACE_ARM64_KVM_H
 #define _TRACE_ARM64_KVM_H
 
-#include <linux/tracepoint.h>
-#include "sys_regs.h"
+#include "trace_arm.h"
+#include "trace_handle_exit.h"
 
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM kvm
-
-TRACE_EVENT(kvm_wfx_arm64,
-       TP_PROTO(unsigned long vcpu_pc, bool is_wfe),
-       TP_ARGS(vcpu_pc, is_wfe),
-
-       TP_STRUCT__entry(
-               __field(unsigned long,  vcpu_pc)
-               __field(bool,           is_wfe)
-       ),
-
-       TP_fast_assign(
-               __entry->vcpu_pc = vcpu_pc;
-               __entry->is_wfe  = is_wfe;
-       ),
-
-       TP_printk("guest executed wf%c at: 0x%08lx",
-                 __entry->is_wfe ? 'e' : 'i', __entry->vcpu_pc)
-);
-
-TRACE_EVENT(kvm_hvc_arm64,
-       TP_PROTO(unsigned long vcpu_pc, unsigned long r0, unsigned long imm),
-       TP_ARGS(vcpu_pc, r0, imm),
-
-       TP_STRUCT__entry(
-               __field(unsigned long, vcpu_pc)
-               __field(unsigned long, r0)
-               __field(unsigned long, imm)
-       ),
-
-       TP_fast_assign(
-               __entry->vcpu_pc = vcpu_pc;
-               __entry->r0 = r0;
-               __entry->imm = imm;
-       ),
-
-       TP_printk("HVC at 0x%08lx (r0: 0x%08lx, imm: 0x%lx)",
-                 __entry->vcpu_pc, __entry->r0, __entry->imm)
-);
-
-TRACE_EVENT(kvm_arm_setup_debug,
-       TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug),
-       TP_ARGS(vcpu, guest_debug),
-
-       TP_STRUCT__entry(
-               __field(struct kvm_vcpu *, vcpu)
-               __field(__u32, guest_debug)
-       ),
-
-       TP_fast_assign(
-               __entry->vcpu = vcpu;
-               __entry->guest_debug = guest_debug;
-       ),
-
-       TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug)
-);
-
-TRACE_EVENT(kvm_arm_clear_debug,
-       TP_PROTO(__u32 guest_debug),
-       TP_ARGS(guest_debug),
-
-       TP_STRUCT__entry(
-               __field(__u32, guest_debug)
-       ),
-
-       TP_fast_assign(
-               __entry->guest_debug = guest_debug;
-       ),
-
-       TP_printk("flags: 0x%08x", __entry->guest_debug)
-);
-
-TRACE_EVENT(kvm_arm_set_dreg32,
-       TP_PROTO(const char *name, __u32 value),
-       TP_ARGS(name, value),
-
-       TP_STRUCT__entry(
-               __field(const char *, name)
-               __field(__u32, value)
-       ),
-
-       TP_fast_assign(
-               __entry->name = name;
-               __entry->value = value;
-       ),
-
-       TP_printk("%s: 0x%08x", __entry->name, __entry->value)
-);
-
-TRACE_DEFINE_SIZEOF(__u64);
-
-TRACE_EVENT(kvm_arm_set_regset,
-       TP_PROTO(const char *type, int len, __u64 *control, __u64 *value),
-       TP_ARGS(type, len, control, value),
-       TP_STRUCT__entry(
-               __field(const char *, name)
-               __field(int, len)
-               __array(u64, ctrls, 16)
-               __array(u64, values, 16)
-       ),
-       TP_fast_assign(
-               __entry->name = type;
-               __entry->len = len;
-               memcpy(__entry->ctrls, control, len << 3);
-               memcpy(__entry->values, value, len << 3);
-       ),
-       TP_printk("%d %s CTRL:%s VALUE:%s", __entry->len, __entry->name,
-               __print_array(__entry->ctrls, __entry->len, sizeof(__u64)),
-               __print_array(__entry->values, __entry->len, sizeof(__u64)))
-);
-
-TRACE_EVENT(trap_reg,
-       TP_PROTO(const char *fn, int reg, bool is_write, u64 write_value),
-       TP_ARGS(fn, reg, is_write, write_value),
-
-       TP_STRUCT__entry(
-               __field(const char *, fn)
-               __field(int, reg)
-               __field(bool, is_write)
-               __field(u64, write_value)
-       ),
-
-       TP_fast_assign(
-               __entry->fn = fn;
-               __entry->reg = reg;
-               __entry->is_write = is_write;
-               __entry->write_value = write_value;
-       ),
-
-       TP_printk("%s %s reg %d (0x%08llx)", __entry->fn,  __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value)
-);
-
-TRACE_EVENT(kvm_handle_sys_reg,
-       TP_PROTO(unsigned long hsr),
-       TP_ARGS(hsr),
-
-       TP_STRUCT__entry(
-               __field(unsigned long,  hsr)
-       ),
-
-       TP_fast_assign(
-               __entry->hsr = hsr;
-       ),
-
-       TP_printk("HSR 0x%08lx", __entry->hsr)
-);
-
-TRACE_EVENT(kvm_sys_access,
-       TP_PROTO(unsigned long vcpu_pc, struct sys_reg_params *params, const struct sys_reg_desc *reg),
-       TP_ARGS(vcpu_pc, params, reg),
-
-       TP_STRUCT__entry(
-               __field(unsigned long,                  vcpu_pc)
-               __field(bool,                           is_write)
-               __field(const char *,                   name)
-               __field(u8,                             Op0)
-               __field(u8,                             Op1)
-               __field(u8,                             CRn)
-               __field(u8,                             CRm)
-               __field(u8,                             Op2)
-       ),
-
-       TP_fast_assign(
-               __entry->vcpu_pc = vcpu_pc;
-               __entry->is_write = params->is_write;
-               __entry->name = reg->name;
-               __entry->Op0 = reg->Op0;
-               __entry->Op0 = reg->Op0;
-               __entry->Op1 = reg->Op1;
-               __entry->CRn = reg->CRn;
-               __entry->CRm = reg->CRm;
-               __entry->Op2 = reg->Op2;
-       ),
-
-       TP_printk("PC: %lx %s (%d,%d,%d,%d,%d) %s",
-                 __entry->vcpu_pc, __entry->name ?: "UNKN",
-                 __entry->Op0, __entry->Op1, __entry->CRn,
-                 __entry->CRm, __entry->Op2,
-                 __entry->is_write ? "write" : "read")
-);
-
-TRACE_EVENT(kvm_set_guest_debug,
-       TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug),
-       TP_ARGS(vcpu, guest_debug),
-
-       TP_STRUCT__entry(
-               __field(struct kvm_vcpu *, vcpu)
-               __field(__u32, guest_debug)
-       ),
-
-       TP_fast_assign(
-               __entry->vcpu = vcpu;
-               __entry->guest_debug = guest_debug;
-       ),
-
-       TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug)
-);
-
-
-#endif /* _TRACE_ARM64_KVM_H */
-
-#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH .
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_FILE trace
-
-/* This part must be outside protection */
-#include <trace/define_trace.h>
+#endif /* _TRACE_ARM64_KVM_H */
diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h
new file mode 100644 (file)
index 0000000..4c71270
--- /dev/null
@@ -0,0 +1,378 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(_TRACE_ARM_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_ARM_ARM64_KVM_H
+
+#include <kvm/arm_arch_timer.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+
+/*
+ * Tracepoints for entry/exit to guest
+ */
+TRACE_EVENT(kvm_entry,
+       TP_PROTO(unsigned long vcpu_pc),
+       TP_ARGS(vcpu_pc),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  vcpu_pc         )
+       ),
+
+       TP_fast_assign(
+               __entry->vcpu_pc                = vcpu_pc;
+       ),
+
+       TP_printk("PC: 0x%08lx", __entry->vcpu_pc)
+);
+
+TRACE_EVENT(kvm_exit,
+       TP_PROTO(int ret, unsigned int esr_ec, unsigned long vcpu_pc),
+       TP_ARGS(ret, esr_ec, vcpu_pc),
+
+       TP_STRUCT__entry(
+               __field(        int,            ret             )
+               __field(        unsigned int,   esr_ec          )
+               __field(        unsigned long,  vcpu_pc         )
+       ),
+
+       TP_fast_assign(
+               __entry->ret                    = ARM_EXCEPTION_CODE(ret);
+               __entry->esr_ec = ARM_EXCEPTION_IS_TRAP(ret) ? esr_ec : 0;
+               __entry->vcpu_pc                = vcpu_pc;
+       ),
+
+       TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx",
+                 __print_symbolic(__entry->ret, kvm_arm_exception_type),
+                 __entry->esr_ec,
+                 __print_symbolic(__entry->esr_ec, kvm_arm_exception_class),
+                 __entry->vcpu_pc)
+);
+
+TRACE_EVENT(kvm_guest_fault,
+       TP_PROTO(unsigned long vcpu_pc, unsigned long hsr,
+                unsigned long hxfar,
+                unsigned long long ipa),
+       TP_ARGS(vcpu_pc, hsr, hxfar, ipa),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  vcpu_pc         )
+               __field(        unsigned long,  hsr             )
+               __field(        unsigned long,  hxfar           )
+               __field(   unsigned long long,  ipa             )
+       ),
+
+       TP_fast_assign(
+               __entry->vcpu_pc                = vcpu_pc;
+               __entry->hsr                    = hsr;
+               __entry->hxfar                  = hxfar;
+               __entry->ipa                    = ipa;
+       ),
+
+       TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#08lx",
+                 __entry->ipa, __entry->hsr,
+                 __entry->hxfar, __entry->vcpu_pc)
+);
+
+TRACE_EVENT(kvm_access_fault,
+       TP_PROTO(unsigned long ipa),
+       TP_ARGS(ipa),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  ipa             )
+       ),
+
+       TP_fast_assign(
+               __entry->ipa            = ipa;
+       ),
+
+       TP_printk("IPA: %lx", __entry->ipa)
+);
+
+TRACE_EVENT(kvm_irq_line,
+       TP_PROTO(unsigned int type, int vcpu_idx, int irq_num, int level),
+       TP_ARGS(type, vcpu_idx, irq_num, level),
+
+       TP_STRUCT__entry(
+               __field(        unsigned int,   type            )
+               __field(        int,            vcpu_idx        )
+               __field(        int,            irq_num         )
+               __field(        int,            level           )
+       ),
+
+       TP_fast_assign(
+               __entry->type           = type;
+               __entry->vcpu_idx       = vcpu_idx;
+               __entry->irq_num        = irq_num;
+               __entry->level          = level;
+       ),
+
+       TP_printk("Inject %s interrupt (%d), vcpu->idx: %d, num: %d, level: %d",
+                 (__entry->type == KVM_ARM_IRQ_TYPE_CPU) ? "CPU" :
+                 (__entry->type == KVM_ARM_IRQ_TYPE_PPI) ? "VGIC PPI" :
+                 (__entry->type == KVM_ARM_IRQ_TYPE_SPI) ? "VGIC SPI" : "UNKNOWN",
+                 __entry->type, __entry->vcpu_idx, __entry->irq_num, __entry->level)
+);
+
+TRACE_EVENT(kvm_mmio_emulate,
+       TP_PROTO(unsigned long vcpu_pc, unsigned long instr,
+                unsigned long cpsr),
+       TP_ARGS(vcpu_pc, instr, cpsr),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  vcpu_pc         )
+               __field(        unsigned long,  instr           )
+               __field(        unsigned long,  cpsr            )
+       ),
+
+       TP_fast_assign(
+               __entry->vcpu_pc                = vcpu_pc;
+               __entry->instr                  = instr;
+               __entry->cpsr                   = cpsr;
+       ),
+
+       TP_printk("Emulate MMIO at: 0x%08lx (instr: %08lx, cpsr: %08lx)",
+                 __entry->vcpu_pc, __entry->instr, __entry->cpsr)
+);
+
+TRACE_EVENT(kvm_unmap_hva_range,
+       TP_PROTO(unsigned long start, unsigned long end),
+       TP_ARGS(start, end),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  start           )
+               __field(        unsigned long,  end             )
+       ),
+
+       TP_fast_assign(
+               __entry->start          = start;
+               __entry->end            = end;
+       ),
+
+       TP_printk("mmu notifier unmap range: %#08lx -- %#08lx",
+                 __entry->start, __entry->end)
+);
+
+TRACE_EVENT(kvm_set_spte_hva,
+       TP_PROTO(unsigned long hva),
+       TP_ARGS(hva),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  hva             )
+       ),
+
+       TP_fast_assign(
+               __entry->hva            = hva;
+       ),
+
+       TP_printk("mmu notifier set pte hva: %#08lx", __entry->hva)
+);
+
+TRACE_EVENT(kvm_age_hva,
+       TP_PROTO(unsigned long start, unsigned long end),
+       TP_ARGS(start, end),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  start           )
+               __field(        unsigned long,  end             )
+       ),
+
+       TP_fast_assign(
+               __entry->start          = start;
+               __entry->end            = end;
+       ),
+
+       TP_printk("mmu notifier age hva: %#08lx -- %#08lx",
+                 __entry->start, __entry->end)
+);
+
+TRACE_EVENT(kvm_test_age_hva,
+       TP_PROTO(unsigned long hva),
+       TP_ARGS(hva),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  hva             )
+       ),
+
+       TP_fast_assign(
+               __entry->hva            = hva;
+       ),
+
+       TP_printk("mmu notifier test age hva: %#08lx", __entry->hva)
+);
+
+TRACE_EVENT(kvm_set_way_flush,
+           TP_PROTO(unsigned long vcpu_pc, bool cache),
+           TP_ARGS(vcpu_pc, cache),
+
+           TP_STRUCT__entry(
+                   __field(    unsigned long,  vcpu_pc         )
+                   __field(    bool,           cache           )
+           ),
+
+           TP_fast_assign(
+                   __entry->vcpu_pc            = vcpu_pc;
+                   __entry->cache              = cache;
+           ),
+
+           TP_printk("S/W flush at 0x%016lx (cache %s)",
+                     __entry->vcpu_pc, __entry->cache ? "on" : "off")
+);
+
+TRACE_EVENT(kvm_toggle_cache,
+           TP_PROTO(unsigned long vcpu_pc, bool was, bool now),
+           TP_ARGS(vcpu_pc, was, now),
+
+           TP_STRUCT__entry(
+                   __field(    unsigned long,  vcpu_pc         )
+                   __field(    bool,           was             )
+                   __field(    bool,           now             )
+           ),
+
+           TP_fast_assign(
+                   __entry->vcpu_pc            = vcpu_pc;
+                   __entry->was                = was;
+                   __entry->now                = now;
+           ),
+
+           TP_printk("VM op at 0x%016lx (cache was %s, now %s)",
+                     __entry->vcpu_pc, __entry->was ? "on" : "off",
+                     __entry->now ? "on" : "off")
+);
+
+/*
+ * Tracepoints for arch_timer
+ */
+TRACE_EVENT(kvm_timer_update_irq,
+       TP_PROTO(unsigned long vcpu_id, __u32 irq, int level),
+       TP_ARGS(vcpu_id, irq, level),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  vcpu_id )
+               __field(        __u32,          irq     )
+               __field(        int,            level   )
+       ),
+
+       TP_fast_assign(
+               __entry->vcpu_id        = vcpu_id;
+               __entry->irq            = irq;
+               __entry->level          = level;
+       ),
+
+       TP_printk("VCPU: %ld, IRQ %d, level %d",
+                 __entry->vcpu_id, __entry->irq, __entry->level)
+);
+
+TRACE_EVENT(kvm_get_timer_map,
+       TP_PROTO(unsigned long vcpu_id, struct timer_map *map),
+       TP_ARGS(vcpu_id, map),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,          vcpu_id )
+               __field(        int,                    direct_vtimer   )
+               __field(        int,                    direct_ptimer   )
+               __field(        int,                    emul_ptimer     )
+       ),
+
+       TP_fast_assign(
+               __entry->vcpu_id                = vcpu_id;
+               __entry->direct_vtimer          = arch_timer_ctx_index(map->direct_vtimer);
+               __entry->direct_ptimer =
+                       (map->direct_ptimer) ? arch_timer_ctx_index(map->direct_ptimer) : -1;
+               __entry->emul_ptimer =
+                       (map->emul_ptimer) ? arch_timer_ctx_index(map->emul_ptimer) : -1;
+       ),
+
+       TP_printk("VCPU: %ld, dv: %d, dp: %d, ep: %d",
+                 __entry->vcpu_id,
+                 __entry->direct_vtimer,
+                 __entry->direct_ptimer,
+                 __entry->emul_ptimer)
+);
+
+TRACE_EVENT(kvm_timer_save_state,
+       TP_PROTO(struct arch_timer_context *ctx),
+       TP_ARGS(ctx),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,          ctl             )
+               __field(        unsigned long long,     cval            )
+               __field(        int,                    timer_idx       )
+       ),
+
+       TP_fast_assign(
+               __entry->ctl                    = ctx->cnt_ctl;
+               __entry->cval                   = ctx->cnt_cval;
+               __entry->timer_idx              = arch_timer_ctx_index(ctx);
+       ),
+
+       TP_printk("   CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d",
+                 __entry->ctl,
+                 __entry->cval,
+                 __entry->timer_idx)
+);
+
+TRACE_EVENT(kvm_timer_restore_state,
+       TP_PROTO(struct arch_timer_context *ctx),
+       TP_ARGS(ctx),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,          ctl             )
+               __field(        unsigned long long,     cval            )
+               __field(        int,                    timer_idx       )
+       ),
+
+       TP_fast_assign(
+               __entry->ctl                    = ctx->cnt_ctl;
+               __entry->cval                   = ctx->cnt_cval;
+               __entry->timer_idx              = arch_timer_ctx_index(ctx);
+       ),
+
+       TP_printk("CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d",
+                 __entry->ctl,
+                 __entry->cval,
+                 __entry->timer_idx)
+);
+
+TRACE_EVENT(kvm_timer_hrtimer_expire,
+       TP_PROTO(struct arch_timer_context *ctx),
+       TP_ARGS(ctx),
+
+       TP_STRUCT__entry(
+               __field(        int,                    timer_idx       )
+       ),
+
+       TP_fast_assign(
+               __entry->timer_idx              = arch_timer_ctx_index(ctx);
+       ),
+
+       TP_printk("arch_timer_ctx_index: %d", __entry->timer_idx)
+);
+
+TRACE_EVENT(kvm_timer_emulate,
+       TP_PROTO(struct arch_timer_context *ctx, bool should_fire),
+       TP_ARGS(ctx, should_fire),
+
+       TP_STRUCT__entry(
+               __field(        int,                    timer_idx       )
+               __field(        bool,                   should_fire     )
+       ),
+
+       TP_fast_assign(
+               __entry->timer_idx              = arch_timer_ctx_index(ctx);
+               __entry->should_fire            = should_fire;
+       ),
+
+       TP_printk("arch_timer_ctx_index: %d (should_fire: %d)",
+                 __entry->timer_idx, __entry->should_fire)
+);
+
+#endif /* _TRACE_ARM_ARM64_KVM_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace_arm
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/arm64/kvm/trace_handle_exit.h b/arch/arm64/kvm/trace_handle_exit.h
new file mode 100644 (file)
index 0000000..2c56d1e
--- /dev/null
@@ -0,0 +1,215 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(_TRACE_HANDLE_EXIT_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_HANDLE_EXIT_ARM64_KVM_H
+
+#include <linux/tracepoint.h>
+#include "sys_regs.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+
+TRACE_EVENT(kvm_wfx_arm64,
+       TP_PROTO(unsigned long vcpu_pc, bool is_wfe),
+       TP_ARGS(vcpu_pc, is_wfe),
+
+       TP_STRUCT__entry(
+               __field(unsigned long,  vcpu_pc)
+               __field(bool,           is_wfe)
+       ),
+
+       TP_fast_assign(
+               __entry->vcpu_pc = vcpu_pc;
+               __entry->is_wfe  = is_wfe;
+       ),
+
+       TP_printk("guest executed wf%c at: 0x%08lx",
+                 __entry->is_wfe ? 'e' : 'i', __entry->vcpu_pc)
+);
+
+TRACE_EVENT(kvm_hvc_arm64,
+       TP_PROTO(unsigned long vcpu_pc, unsigned long r0, unsigned long imm),
+       TP_ARGS(vcpu_pc, r0, imm),
+
+       TP_STRUCT__entry(
+               __field(unsigned long, vcpu_pc)
+               __field(unsigned long, r0)
+               __field(unsigned long, imm)
+       ),
+
+       TP_fast_assign(
+               __entry->vcpu_pc = vcpu_pc;
+               __entry->r0 = r0;
+               __entry->imm = imm;
+       ),
+
+       TP_printk("HVC at 0x%08lx (r0: 0x%08lx, imm: 0x%lx)",
+                 __entry->vcpu_pc, __entry->r0, __entry->imm)
+);
+
+TRACE_EVENT(kvm_arm_setup_debug,
+       TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug),
+       TP_ARGS(vcpu, guest_debug),
+
+       TP_STRUCT__entry(
+               __field(struct kvm_vcpu *, vcpu)
+               __field(__u32, guest_debug)
+       ),
+
+       TP_fast_assign(
+               __entry->vcpu = vcpu;
+               __entry->guest_debug = guest_debug;
+       ),
+
+       TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug)
+);
+
+TRACE_EVENT(kvm_arm_clear_debug,
+       TP_PROTO(__u32 guest_debug),
+       TP_ARGS(guest_debug),
+
+       TP_STRUCT__entry(
+               __field(__u32, guest_debug)
+       ),
+
+       TP_fast_assign(
+               __entry->guest_debug = guest_debug;
+       ),
+
+       TP_printk("flags: 0x%08x", __entry->guest_debug)
+);
+
+TRACE_EVENT(kvm_arm_set_dreg32,
+       TP_PROTO(const char *name, __u32 value),
+       TP_ARGS(name, value),
+
+       TP_STRUCT__entry(
+               __field(const char *, name)
+               __field(__u32, value)
+       ),
+
+       TP_fast_assign(
+               __entry->name = name;
+               __entry->value = value;
+       ),
+
+       TP_printk("%s: 0x%08x", __entry->name, __entry->value)
+);
+
+TRACE_DEFINE_SIZEOF(__u64);
+
+TRACE_EVENT(kvm_arm_set_regset,
+       TP_PROTO(const char *type, int len, __u64 *control, __u64 *value),
+       TP_ARGS(type, len, control, value),
+       TP_STRUCT__entry(
+               __field(const char *, name)
+               __field(int, len)
+               __array(u64, ctrls, 16)
+               __array(u64, values, 16)
+       ),
+       TP_fast_assign(
+               __entry->name = type;
+               __entry->len = len;
+               memcpy(__entry->ctrls, control, len << 3);
+               memcpy(__entry->values, value, len << 3);
+       ),
+       TP_printk("%d %s CTRL:%s VALUE:%s", __entry->len, __entry->name,
+               __print_array(__entry->ctrls, __entry->len, sizeof(__u64)),
+               __print_array(__entry->values, __entry->len, sizeof(__u64)))
+);
+
+TRACE_EVENT(trap_reg,
+       TP_PROTO(const char *fn, int reg, bool is_write, u64 write_value),
+       TP_ARGS(fn, reg, is_write, write_value),
+
+       TP_STRUCT__entry(
+               __field(const char *, fn)
+               __field(int, reg)
+               __field(bool, is_write)
+               __field(u64, write_value)
+       ),
+
+       TP_fast_assign(
+               __entry->fn = fn;
+               __entry->reg = reg;
+               __entry->is_write = is_write;
+               __entry->write_value = write_value;
+       ),
+
+       TP_printk("%s %s reg %d (0x%08llx)", __entry->fn, __entry->is_write ? "write to" : "read from", __entry->reg, __entry->write_value)
+);
+
+TRACE_EVENT(kvm_handle_sys_reg,
+       TP_PROTO(unsigned long hsr),
+       TP_ARGS(hsr),
+
+       TP_STRUCT__entry(
+               __field(unsigned long,  hsr)
+       ),
+
+       TP_fast_assign(
+               __entry->hsr = hsr;
+       ),
+
+       TP_printk("HSR 0x%08lx", __entry->hsr)
+);
+
+TRACE_EVENT(kvm_sys_access,
+       TP_PROTO(unsigned long vcpu_pc, struct sys_reg_params *params, const struct sys_reg_desc *reg),
+       TP_ARGS(vcpu_pc, params, reg),
+
+       TP_STRUCT__entry(
+               __field(unsigned long,                  vcpu_pc)
+               __field(bool,                           is_write)
+               __field(const char *,                   name)
+               __field(u8,                             Op0)
+               __field(u8,                             Op1)
+               __field(u8,                             CRn)
+               __field(u8,                             CRm)
+               __field(u8,                             Op2)
+       ),
+
+       TP_fast_assign(
+               __entry->vcpu_pc = vcpu_pc;
+               __entry->is_write = params->is_write;
+               __entry->name = reg->name;
+               __entry->Op0 = reg->Op0;
+               __entry->Op0 = reg->Op0;
+               __entry->Op1 = reg->Op1;
+               __entry->CRn = reg->CRn;
+               __entry->CRm = reg->CRm;
+               __entry->Op2 = reg->Op2;
+       ),
+
+       TP_printk("PC: %lx %s (%d,%d,%d,%d,%d) %s",
+                 __entry->vcpu_pc, __entry->name ?: "UNKN",
+                 __entry->Op0, __entry->Op1, __entry->CRn,
+                 __entry->CRm, __entry->Op2,
+                 __entry->is_write ? "write" : "read")
+);
+
+TRACE_EVENT(kvm_set_guest_debug,
+       TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug),
+       TP_ARGS(vcpu, guest_debug),
+
+       TP_STRUCT__entry(
+               __field(struct kvm_vcpu *, vcpu)
+               __field(__u32, guest_debug)
+       ),
+
+       TP_fast_assign(
+               __entry->vcpu = vcpu;
+               __entry->guest_debug = guest_debug;
+       ),
+
+       TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug)
+);
+
+#endif /* _TRACE_HANDLE_EXIT_ARM64_KVM_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace_handle_exit
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
index e7d1ea92095ddd796354e3536577604a2f8851b4..2f92bdcb1188585c9826d0e6985bf25d95fe3bbf 100644 (file)
@@ -7,7 +7,7 @@
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <asm/kvm_emulate.h>
-#include "vgic.h"
+#include "vgic/vgic.h"
 #include "sys_regs.h"
 
 static bool access_gic_ctlr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
diff --git a/arch/arm64/kvm/vgic/trace.h b/arch/arm64/kvm/vgic/trace.h
new file mode 100644 (file)
index 0000000..83c6440
--- /dev/null
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(_TRACE_VGIC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_VGIC_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+
+TRACE_EVENT(vgic_update_irq_pending,
+       TP_PROTO(unsigned long vcpu_id, __u32 irq, bool level),
+       TP_ARGS(vcpu_id, irq, level),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  vcpu_id )
+               __field(        __u32,          irq     )
+               __field(        bool,           level   )
+       ),
+
+       TP_fast_assign(
+               __entry->vcpu_id        = vcpu_id;
+               __entry->irq            = irq;
+               __entry->level          = level;
+       ),
+
+       TP_printk("VCPU: %ld, IRQ %d, level: %d",
+                 __entry->vcpu_id, __entry->irq, __entry->level)
+);
+
+#endif /* _TRACE_VGIC_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../arch/arm64/kvm/vgic
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c
new file mode 100644 (file)
index 0000000..b13a9e3
--- /dev/null
@@ -0,0 +1,300 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2016 Linaro
+ * Author: Christoffer Dall <christoffer.dall@linaro.org>
+ */
+
+#include <linux/cpu.h>
+#include <linux/debugfs.h>
+#include <linux/interrupt.h>
+#include <linux/kvm_host.h>
+#include <linux/seq_file.h>
+#include <kvm/arm_vgic.h>
+#include <asm/kvm_mmu.h>
+#include "vgic.h"
+
+/*
+ * Structure to control looping through the entire vgic state.  We start at
+ * zero for each field and move upwards.  So, if dist_id is 0 we print the
+ * distributor info.  When dist_id is 1, we have already printed it and move
+ * on.
+ *
+ * When vcpu_id < nr_cpus we print the vcpu info until vcpu_id == nr_cpus and
+ * so on.
+ */
+struct vgic_state_iter {
+       int nr_cpus;
+       int nr_spis;
+       int nr_lpis;
+       int dist_id;
+       int vcpu_id;
+       int intid;
+       int lpi_idx;
+       u32 *lpi_array;
+};
+
+static void iter_next(struct vgic_state_iter *iter)
+{
+       if (iter->dist_id == 0) {
+               iter->dist_id++;
+               return;
+       }
+
+       iter->intid++;
+       if (iter->intid == VGIC_NR_PRIVATE_IRQS &&
+           ++iter->vcpu_id < iter->nr_cpus)
+               iter->intid = 0;
+
+       if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS)) {
+               if (iter->lpi_idx < iter->nr_lpis)
+                       iter->intid = iter->lpi_array[iter->lpi_idx];
+               iter->lpi_idx++;
+       }
+}
+
+static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter,
+                     loff_t pos)
+{
+       int nr_cpus = atomic_read(&kvm->online_vcpus);
+
+       memset(iter, 0, sizeof(*iter));
+
+       iter->nr_cpus = nr_cpus;
+       iter->nr_spis = kvm->arch.vgic.nr_spis;
+       if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
+               iter->nr_lpis = vgic_copy_lpi_list(kvm, NULL, &iter->lpi_array);
+               if (iter->nr_lpis < 0)
+                       iter->nr_lpis = 0;
+       }
+
+       /* Fast forward to the right position if needed */
+       while (pos--)
+               iter_next(iter);
+}
+
+static bool end_of_vgic(struct vgic_state_iter *iter)
+{
+       return iter->dist_id > 0 &&
+               iter->vcpu_id == iter->nr_cpus &&
+               iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS) &&
+               iter->lpi_idx > iter->nr_lpis;
+}
+
+static void *vgic_debug_start(struct seq_file *s, loff_t *pos)
+{
+       struct kvm *kvm = (struct kvm *)s->private;
+       struct vgic_state_iter *iter;
+
+       mutex_lock(&kvm->lock);
+       iter = kvm->arch.vgic.iter;
+       if (iter) {
+               iter = ERR_PTR(-EBUSY);
+               goto out;
+       }
+
+       iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+       if (!iter) {
+               iter = ERR_PTR(-ENOMEM);
+               goto out;
+       }
+
+       iter_init(kvm, iter, *pos);
+       kvm->arch.vgic.iter = iter;
+
+       if (end_of_vgic(iter))
+               iter = NULL;
+out:
+       mutex_unlock(&kvm->lock);
+       return iter;
+}
+
+static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos)
+{
+       struct kvm *kvm = (struct kvm *)s->private;
+       struct vgic_state_iter *iter = kvm->arch.vgic.iter;
+
+       ++*pos;
+       iter_next(iter);
+       if (end_of_vgic(iter))
+               iter = NULL;
+       return iter;
+}
+
+static void vgic_debug_stop(struct seq_file *s, void *v)
+{
+       struct kvm *kvm = (struct kvm *)s->private;
+       struct vgic_state_iter *iter;
+
+       /*
+        * If the seq file wasn't properly opened, there's nothing to clean
+        * up.
+        */
+       if (IS_ERR(v))
+               return;
+
+       mutex_lock(&kvm->lock);
+       iter = kvm->arch.vgic.iter;
+       kfree(iter->lpi_array);
+       kfree(iter);
+       kvm->arch.vgic.iter = NULL;
+       mutex_unlock(&kvm->lock);
+}
+
+static void print_dist_state(struct seq_file *s, struct vgic_dist *dist)
+{
+       bool v3 = dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3;
+
+       seq_printf(s, "Distributor\n");
+       seq_printf(s, "===========\n");
+       seq_printf(s, "vgic_model:\t%s\n", v3 ? "GICv3" : "GICv2");
+       seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis);
+       if (v3)
+               seq_printf(s, "nr_lpis:\t%d\n", dist->lpi_list_count);
+       seq_printf(s, "enabled:\t%d\n", dist->enabled);
+       seq_printf(s, "\n");
+
+       seq_printf(s, "P=pending_latch, L=line_level, A=active\n");
+       seq_printf(s, "E=enabled, H=hw, C=config (level=1, edge=0)\n");
+       seq_printf(s, "G=group\n");
+}
+
+static void print_header(struct seq_file *s, struct vgic_irq *irq,
+                        struct kvm_vcpu *vcpu)
+{
+       int id = 0;
+       char *hdr = "SPI ";
+
+       if (vcpu) {
+               hdr = "VCPU";
+               id = vcpu->vcpu_id;
+       }
+
+       seq_printf(s, "\n");
+       seq_printf(s, "%s%2d TYP   ID TGT_ID PLAEHCG     HWID   TARGET SRC PRI VCPU_ID\n", hdr, id);
+       seq_printf(s, "----------------------------------------------------------------\n");
+}
+
+static void print_irq_state(struct seq_file *s, struct vgic_irq *irq,
+                           struct kvm_vcpu *vcpu)
+{
+       char *type;
+       bool pending;
+
+       if (irq->intid < VGIC_NR_SGIS)
+               type = "SGI";
+       else if (irq->intid < VGIC_NR_PRIVATE_IRQS)
+               type = "PPI";
+       else if (irq->intid < VGIC_MAX_SPI)
+               type = "SPI";
+       else
+               type = "LPI";
+
+       if (irq->intid == 0 || irq->intid == VGIC_NR_PRIVATE_IRQS)
+               print_header(s, irq, vcpu);
+
+       pending = irq->pending_latch;
+       if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
+               int err;
+
+               err = irq_get_irqchip_state(irq->host_irq,
+                                           IRQCHIP_STATE_PENDING,
+                                           &pending);
+               WARN_ON_ONCE(err);
+       }
+
+       seq_printf(s, "       %s %4d "
+                     "    %2d "
+                     "%d%d%d%d%d%d%d "
+                     "%8d "
+                     "%8x "
+                     " %2x "
+                     "%3d "
+                     "     %2d "
+                     "\n",
+                       type, irq->intid,
+                       (irq->target_vcpu) ? irq->target_vcpu->vcpu_id : -1,
+                       pending,
+                       irq->line_level,
+                       irq->active,
+                       irq->enabled,
+                       irq->hw,
+                       irq->config == VGIC_CONFIG_LEVEL,
+                       irq->group,
+                       irq->hwintid,
+                       irq->mpidr,
+                       irq->source,
+                       irq->priority,
+                       (irq->vcpu) ? irq->vcpu->vcpu_id : -1);
+}
+
+static int vgic_debug_show(struct seq_file *s, void *v)
+{
+       struct kvm *kvm = (struct kvm *)s->private;
+       struct vgic_state_iter *iter = (struct vgic_state_iter *)v;
+       struct vgic_irq *irq;
+       struct kvm_vcpu *vcpu = NULL;
+       unsigned long flags;
+
+       if (iter->dist_id == 0) {
+               print_dist_state(s, &kvm->arch.vgic);
+               return 0;
+       }
+
+       if (!kvm->arch.vgic.initialized)
+               return 0;
+
+       if (iter->vcpu_id < iter->nr_cpus)
+               vcpu = kvm_get_vcpu(kvm, iter->vcpu_id);
+
+       irq = vgic_get_irq(kvm, vcpu, iter->intid);
+       if (!irq) {
+               seq_printf(s, "       LPI %4d freed\n", iter->intid);
+               return 0;
+       }
+
+       raw_spin_lock_irqsave(&irq->irq_lock, flags);
+       print_irq_state(s, irq, vcpu);
+       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+       vgic_put_irq(kvm, irq);
+       return 0;
+}
+
+static const struct seq_operations vgic_debug_seq_ops = {
+       .start = vgic_debug_start,
+       .next  = vgic_debug_next,
+       .stop  = vgic_debug_stop,
+       .show  = vgic_debug_show
+};
+
+static int debug_open(struct inode *inode, struct file *file)
+{
+       int ret;
+       ret = seq_open(file, &vgic_debug_seq_ops);
+       if (!ret) {
+               struct seq_file *seq;
+               /* seq_open will have modified file->private_data */
+               seq = file->private_data;
+               seq->private = inode->i_private;
+       }
+
+       return ret;
+}
+
+static const struct file_operations vgic_debug_fops = {
+       .owner   = THIS_MODULE,
+       .open    = debug_open,
+       .read    = seq_read,
+       .llseek  = seq_lseek,
+       .release = seq_release
+};
+
+void vgic_debug_init(struct kvm *kvm)
+{
+       debugfs_create_file("vgic-state", 0444, kvm->debugfs_dentry, kvm,
+                           &vgic_debug_fops);
+}
+
+void vgic_debug_destroy(struct kvm *kvm)
+{
+}
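
vgic_debug_init() exposes this state as a per-VM debugfs file. A small host-side reader is sketched below; the "<pid>-<vm_fd>" directory layout under /sys/kernel/debug/kvm comes from the generic KVM debugfs code and is an assumption here, not something this file defines:

    #include <stdio.h>

    /*
     * Sketch only: dump the vgic-state file for one VM. The directory name
     * ("<pid>-<vm_fd>" under /sys/kernel/debug/kvm) is an assumption.
     */
    static int dump_vgic_state(const char *vm_dir)
    {
            char path[256];
            char line[512];
            FILE *f;

            snprintf(path, sizeof(path),
                     "/sys/kernel/debug/kvm/%s/vgic-state", vm_dir);
            f = fopen(path, "r");
            if (!f)
                    return -1;

            /* Distributor summary first, then one row per interrupt. */
            while (fgets(line, sizeof(line), f))
                    fputs(line, stdout);

            fclose(f);
            return 0;
    }
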
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
new file mode 100644 (file)
index 0000000..32e32d6
--- /dev/null
@@ -0,0 +1,556 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ */
+
+#include <linux/uaccess.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/kvm_host.h>
+#include <kvm/arm_vgic.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_mmu.h>
+#include "vgic.h"
+
+/*
+ * Initialization rules: there are multiple stages to the vgic
+ * initialization, both for the distributor and the CPU interfaces.  The basic
+ * idea is that even though the VGIC is not functional or not requested from
+ * user space, the critical path of the run loop can still call VGIC functions
+ * that just won't do anything, without them having to check additional
+ * initialization flags to ensure they don't look at uninitialized data
+ * structures.
+ *
+ * Distributor:
+ *
+ * - kvm_vgic_early_init(): initialization of static data that doesn't
+ *   depend on any sizing information or emulation type. No allocation
+ *   is allowed there.
+ *
+ * - vgic_init(): allocation and initialization of the generic data
+ *   structures that depend on sizing information (number of CPUs,
+ *   number of interrupts). Also initializes the vcpu specific data
+ *   structures. Can be executed lazily for GICv2.
+ *
+ * CPU Interface:
+ *
+ * - kvm_vgic_vcpu_init(): initialization of static data that
+ *   doesn't depend on any sizing information or emulation type. No
+ *   allocation is allowed there.
+ */
+
+/* EARLY INIT */
+
+/**
+ * kvm_vgic_early_init() - Initialize static VGIC VCPU data structures
+ * @kvm: The VM whose VGIC distributor should be initialized
+ *
+ * Only do initialization of static structures that don't require any
+ * allocation or sizing information from userspace.  vgic_init() calls
+ * kvm_vgic_dist_init(), which takes care of the rest.
+ */
+void kvm_vgic_early_init(struct kvm *kvm)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+
+       INIT_LIST_HEAD(&dist->lpi_list_head);
+       INIT_LIST_HEAD(&dist->lpi_translation_cache);
+       raw_spin_lock_init(&dist->lpi_list_lock);
+}
+
+/* CREATION */
+
+/**
+ * kvm_vgic_create: triggered by the instantiation of the VGIC device by
+ * user space, either through the legacy KVM_CREATE_IRQCHIP ioctl (v2 only)
+ * or through the generic KVM_CREATE_DEVICE API ioctl.
+ * irqchip_in_kernel() tells you if this function succeeded or not.
+ * @kvm: kvm struct pointer
+ * @type: KVM_DEV_TYPE_ARM_VGIC_V[23]
+ */
+int kvm_vgic_create(struct kvm *kvm, u32 type)
+{
+       int i, ret;
+       struct kvm_vcpu *vcpu;
+
+       if (irqchip_in_kernel(kvm))
+               return -EEXIST;
+
+       /*
+        * This function is also called by the KVM_CREATE_IRQCHIP handler,
+        * which had no chance yet to check the availability of the GICv2
+        * emulation. So check this here again. KVM_CREATE_DEVICE does
+        * the proper checks already.
+        */
+       if (type == KVM_DEV_TYPE_ARM_VGIC_V2 &&
+               !kvm_vgic_global_state.can_emulate_gicv2)
+               return -ENODEV;
+
+       ret = -EBUSY;
+       if (!lock_all_vcpus(kvm))
+               return ret;
+
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               if (vcpu->arch.has_run_once)
+                       goto out_unlock;
+       }
+       ret = 0;
+
+       if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
+               kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS;
+       else
+               kvm->arch.max_vcpus = VGIC_V3_MAX_CPUS;
+
+       if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus) {
+               ret = -E2BIG;
+               goto out_unlock;
+       }
+
+       kvm->arch.vgic.in_kernel = true;
+       kvm->arch.vgic.vgic_model = type;
+
+       kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
+
+       if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
+               kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
+       else
+               INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions);
+
+out_unlock:
+       unlock_all_vcpus(kvm);
+       return ret;
+}
+
+/* INIT/DESTROY */
+
+/**
+ * kvm_vgic_dist_init: initialize the dist data structures
+ * @kvm: kvm struct pointer
+ * @nr_spis: number of spis, frozen by caller
+ */
+static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0);
+       int i;
+
+       dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL);
+       if (!dist->spis)
+               return -ENOMEM;
+
+       /*
+        * In the following code we do not take the irq struct lock since
+        * no other action on irq structs can happen while the VGIC is
+        * not initialized yet:
+        * If someone wants to inject an interrupt or does an MMIO access, we
+        * require prior initialization in case of a virtual GICv3 or trigger
+        * initialization when using a virtual GICv2.
+        */
+       for (i = 0; i < nr_spis; i++) {
+               struct vgic_irq *irq = &dist->spis[i];
+
+               irq->intid = i + VGIC_NR_PRIVATE_IRQS;
+               INIT_LIST_HEAD(&irq->ap_list);
+               raw_spin_lock_init(&irq->irq_lock);
+               irq->vcpu = NULL;
+               irq->target_vcpu = vcpu0;
+               kref_init(&irq->refcount);
+               switch (dist->vgic_model) {
+               case KVM_DEV_TYPE_ARM_VGIC_V2:
+                       irq->targets = 0;
+                       irq->group = 0;
+                       break;
+               case KVM_DEV_TYPE_ARM_VGIC_V3:
+                       irq->mpidr = 0;
+                       irq->group = 1;
+                       break;
+               default:
+                       kfree(dist->spis);
+                       dist->spis = NULL;
+                       return -EINVAL;
+               }
+       }
+       return 0;
+}
+
+/**
+ * kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data
+ * structures and register VCPU-specific KVM iodevs
+ *
+ * @vcpu: pointer to the VCPU being created and initialized
+ *
+ * Only do initialization, but do not actually enable the
+ * VGIC CPU interface
+ */
+int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+       int ret = 0;
+       int i;
+
+       vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
+
+       INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
+       raw_spin_lock_init(&vgic_cpu->ap_list_lock);
+       atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0);
+
+       /*
+        * Enable and configure all SGIs to be edge-triggered and
+        * configure all PPIs as level-triggered.
+        */
+       for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
+               struct vgic_irq *irq = &vgic_cpu->private_irqs[i];
+
+               INIT_LIST_HEAD(&irq->ap_list);
+               raw_spin_lock_init(&irq->irq_lock);
+               irq->intid = i;
+               irq->vcpu = NULL;
+               irq->target_vcpu = vcpu;
+               kref_init(&irq->refcount);
+               if (vgic_irq_is_sgi(i)) {
+                       /* SGIs */
+                       irq->enabled = 1;
+                       irq->config = VGIC_CONFIG_EDGE;
+               } else {
+                       /* PPIs */
+                       irq->config = VGIC_CONFIG_LEVEL;
+               }
+       }
+
+       if (!irqchip_in_kernel(vcpu->kvm))
+               return 0;
+
+       /*
+        * If we are creating a VCPU with a GICv3 we must also register the
+        * KVM io device for the redistributor that belongs to this VCPU.
+        */
+       if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
+               mutex_lock(&vcpu->kvm->lock);
+               ret = vgic_register_redist_iodev(vcpu);
+               mutex_unlock(&vcpu->kvm->lock);
+       }
+       return ret;
+}
+
+static void kvm_vgic_vcpu_enable(struct kvm_vcpu *vcpu)
+{
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_enable(vcpu);
+       else
+               vgic_v3_enable(vcpu);
+}
+
+/*
+ * vgic_init: allocates and initializes dist and vcpu data structures
+ * depending on two dimensioning parameters:
+ * - the number of spis
+ * - the number of vcpus
+ * The function is generally called when nr_spis has been explicitly set
+ * by the guest through the KVM DEVICE API. If not, nr_spis is set to 256.
+ * vgic_initialized() returns true when this function has succeeded.
+ * Must be called with kvm->lock held!
+ */
+int vgic_init(struct kvm *kvm)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct kvm_vcpu *vcpu;
+       int ret = 0, i, idx;
+
+       if (vgic_initialized(kvm))
+               return 0;
+
+       /* Are we also in the middle of creating a VCPU? */
+       if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus))
+               return -EBUSY;
+
+       /* freeze the number of spis */
+       if (!dist->nr_spis)
+               dist->nr_spis = VGIC_NR_IRQS_LEGACY - VGIC_NR_PRIVATE_IRQS;
+
+       ret = kvm_vgic_dist_init(kvm, dist->nr_spis);
+       if (ret)
+               goto out;
+
+       /* Initialize groups on CPUs created before the VGIC type was known */
+       kvm_for_each_vcpu(idx, vcpu, kvm) {
+               struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+               for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
+                       struct vgic_irq *irq = &vgic_cpu->private_irqs[i];
+                       switch (dist->vgic_model) {
+                       case KVM_DEV_TYPE_ARM_VGIC_V3:
+                               irq->group = 1;
+                               irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
+                               break;
+                       case KVM_DEV_TYPE_ARM_VGIC_V2:
+                               irq->group = 0;
+                               irq->targets = 1U << idx;
+                               break;
+                       default:
+                               ret = -EINVAL;
+                               goto out;
+                       }
+               }
+       }
+
+       if (vgic_has_its(kvm))
+               vgic_lpi_translation_cache_init(kvm);
+
+       /*
+        * If we have GICv4.1 enabled, unconditionally enable the
+        * v4 support so that we get HW-accelerated vSGIs. Otherwise, only
+        * enable it if we present a virtual ITS to the guest.
+        */
+       if (vgic_supports_direct_msis(kvm)) {
+               ret = vgic_v4_init(kvm);
+               if (ret)
+                       goto out;
+       }
+
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               kvm_vgic_vcpu_enable(vcpu);
+
+       ret = kvm_vgic_setup_default_irq_routing(kvm);
+       if (ret)
+               goto out;
+
+       vgic_debug_init(kvm);
+
+       dist->implementation_rev = 2;
+       dist->initialized = true;
+
+out:
+       return ret;
+}
+
+static void kvm_vgic_dist_destroy(struct kvm *kvm)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct vgic_redist_region *rdreg, *next;
+
+       dist->ready = false;
+       dist->initialized = false;
+
+       kfree(dist->spis);
+       dist->spis = NULL;
+       dist->nr_spis = 0;
+
+       if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
+               list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) {
+                       list_del(&rdreg->list);
+                       kfree(rdreg);
+               }
+               INIT_LIST_HEAD(&dist->rd_regions);
+       }
+
+       if (vgic_has_its(kvm))
+               vgic_lpi_translation_cache_destroy(kvm);
+
+       if (vgic_supports_direct_msis(kvm))
+               vgic_v4_teardown(kvm);
+}
+
+void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+       /*
+        * Retire all pending LPIs on this vcpu anyway as we're
+        * going to destroy it.
+        */
+       vgic_flush_pending_lpis(vcpu);
+
+       INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
+}
+
+/* To be called with kvm->lock held */
+static void __kvm_vgic_destroy(struct kvm *kvm)
+{
+       struct kvm_vcpu *vcpu;
+       int i;
+
+       vgic_debug_destroy(kvm);
+
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               kvm_vgic_vcpu_destroy(vcpu);
+
+       kvm_vgic_dist_destroy(kvm);
+}
+
+void kvm_vgic_destroy(struct kvm *kvm)
+{
+       mutex_lock(&kvm->lock);
+       __kvm_vgic_destroy(kvm);
+       mutex_unlock(&kvm->lock);
+}
+
+/**
+ * vgic_lazy_init: Lazy init is only allowed if the GIC exposed to the guest
+ * is a GICv2. A GICv3 must be explicitly initialized by the guest using the
+ * KVM_DEV_ARM_VGIC_GRP_CTRL KVM_DEVICE group.
+ * @kvm: kvm struct pointer
+ */
+int vgic_lazy_init(struct kvm *kvm)
+{
+       int ret = 0;
+
+       if (unlikely(!vgic_initialized(kvm))) {
+               /*
+                * We only provide the automatic initialization of the VGIC
+                * for the legacy case of a GICv2. Any other type must
+                * be explicitly initialized once set up with the respective
+                * KVM device call.
+                */
+               if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2)
+                       return -EBUSY;
+
+               mutex_lock(&kvm->lock);
+               ret = vgic_init(kvm);
+               mutex_unlock(&kvm->lock);
+       }
+
+       return ret;
+}
+
+/* RESOURCE MAPPING */
+
+/**
+ * Map the MMIO regions depending on the VGIC model exposed to the guest;
+ * called on the first VCPU run.
+ * Also map the virtual CPU interface into the VM.
+ * v2/v3 derivatives call vgic_init if not already done.
+ * vgic_ready() returns true if this function has succeeded.
+ * @kvm: kvm struct pointer
+ */
+int kvm_vgic_map_resources(struct kvm *kvm)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       int ret = 0;
+
+       mutex_lock(&kvm->lock);
+       if (!irqchip_in_kernel(kvm))
+               goto out;
+
+       if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2)
+               ret = vgic_v2_map_resources(kvm);
+       else
+               ret = vgic_v3_map_resources(kvm);
+
+       if (ret)
+               __kvm_vgic_destroy(kvm);
+
+out:
+       mutex_unlock(&kvm->lock);
+       return ret;
+}
+
+/* GENERIC PROBE */
+
+static int vgic_init_cpu_starting(unsigned int cpu)
+{
+       enable_percpu_irq(kvm_vgic_global_state.maint_irq, 0);
+       return 0;
+}
+
+
+static int vgic_init_cpu_dying(unsigned int cpu)
+{
+       disable_percpu_irq(kvm_vgic_global_state.maint_irq);
+       return 0;
+}
+
+static irqreturn_t vgic_maintenance_handler(int irq, void *data)
+{
+       /*
+        * We cannot rely on the vgic maintenance interrupt to be
+        * delivered synchronously. This means we can only use it to
+        * exit the VM, and we perform the handling of EOIed
+        * interrupts on the exit path (see vgic_fold_lr_state).
+        */
+       return IRQ_HANDLED;
+}
+
+/**
+ * kvm_vgic_init_cpu_hardware - initialize the GIC VE hardware
+ *
+ * For a specific CPU, initialize the GIC VE hardware.
+ */
+void kvm_vgic_init_cpu_hardware(void)
+{
+       BUG_ON(preemptible());
+
+       /*
+        * We want to make sure the list registers start out clear so that we
+        * only have to program the used registers.
+        */
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_init_lrs();
+       else
+               kvm_call_hyp(__vgic_v3_init_lrs);
+}
+
+/**
+ * kvm_vgic_hyp_init: populates the kvm_vgic_global_state variable
+ * according to the host GIC model. Accordingly calls either
+ * vgic_v2/v3_probe which registers the KVM_DEVICE that can be
+ * instantiated by a guest later on.
+ */
+int kvm_vgic_hyp_init(void)
+{
+       const struct gic_kvm_info *gic_kvm_info;
+       int ret;
+
+       gic_kvm_info = gic_get_kvm_info();
+       if (!gic_kvm_info)
+               return -ENODEV;
+
+       if (!gic_kvm_info->maint_irq) {
+               kvm_err("No vgic maintenance irq\n");
+               return -ENXIO;
+       }
+
+       switch (gic_kvm_info->type) {
+       case GIC_V2:
+               ret = vgic_v2_probe(gic_kvm_info);
+               break;
+       case GIC_V3:
+               ret = vgic_v3_probe(gic_kvm_info);
+               if (!ret) {
+                       static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif);
+                       kvm_info("GIC system register CPU interface enabled\n");
+               }
+               break;
+       default:
+               ret = -ENODEV;
+       }
+
+       if (ret)
+               return ret;
+
+       kvm_vgic_global_state.maint_irq = gic_kvm_info->maint_irq;
+       ret = request_percpu_irq(kvm_vgic_global_state.maint_irq,
+                                vgic_maintenance_handler,
+                                "vgic", kvm_get_running_vcpus());
+       if (ret) {
+               kvm_err("Cannot register interrupt %d\n",
+                       kvm_vgic_global_state.maint_irq);
+               return ret;
+       }
+
+       ret = cpuhp_setup_state(CPUHP_AP_KVM_ARM_VGIC_INIT_STARTING,
+                               "kvm/arm/vgic:starting",
+                               vgic_init_cpu_starting, vgic_init_cpu_dying);
+       if (ret) {
+               kvm_err("Cannot register vgic CPU notifier\n");
+               goto out_free_irq;
+       }
+
+       kvm_info("vgic interrupt IRQ%d\n", kvm_vgic_global_state.maint_irq);
+       return 0;
+
+out_free_irq:
+       free_percpu_irq(kvm_vgic_global_state.maint_irq,
+                       kvm_get_running_vcpus());
+       return ret;
+}
diff --git a/arch/arm64/kvm/vgic/vgic-irqfd.c b/arch/arm64/kvm/vgic/vgic-irqfd.c
new file mode 100644 (file)
index 0000000..d8cdfea
--- /dev/null
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ */
+
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <trace/events/kvm.h>
+#include <kvm/arm_vgic.h>
+#include "vgic.h"
+
+/**
+ * vgic_irqfd_set_irq - inject the IRQ corresponding to the
+ * irqchip routing entry
+ *
+ * This is the entry point for irqfd IRQ injection.
+ */
+static int vgic_irqfd_set_irq(struct kvm_kernel_irq_routing_entry *e,
+                       struct kvm *kvm, int irq_source_id,
+                       int level, bool line_status)
+{
+       unsigned int spi_id = e->irqchip.pin + VGIC_NR_PRIVATE_IRQS;
+
+       if (!vgic_valid_spi(kvm, spi_id))
+               return -EINVAL;
+       return kvm_vgic_inject_irq(kvm, 0, spi_id, level, NULL);
+}
+
+/**
+ * kvm_set_routing_entry - populate a kvm routing entry
+ * from a user routing entry
+ *
+ * @kvm: the VM this entry is applied to
+ * @e: kvm kernel routing entry handle
+ * @ue: user api routing entry handle
+ *
+ * Return: 0 on success, -EINVAL on errors.
+ */
+int kvm_set_routing_entry(struct kvm *kvm,
+                         struct kvm_kernel_irq_routing_entry *e,
+                         const struct kvm_irq_routing_entry *ue)
+{
+       int r = -EINVAL;
+
+       switch (ue->type) {
+       case KVM_IRQ_ROUTING_IRQCHIP:
+               e->set = vgic_irqfd_set_irq;
+               e->irqchip.irqchip = ue->u.irqchip.irqchip;
+               e->irqchip.pin = ue->u.irqchip.pin;
+               if ((e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS) ||
+                   (e->irqchip.irqchip >= KVM_NR_IRQCHIPS))
+                       goto out;
+               break;
+       case KVM_IRQ_ROUTING_MSI:
+               e->set = kvm_set_msi;
+               e->msi.address_lo = ue->u.msi.address_lo;
+               e->msi.address_hi = ue->u.msi.address_hi;
+               e->msi.data = ue->u.msi.data;
+               e->msi.flags = ue->flags;
+               e->msi.devid = ue->u.msi.devid;
+               break;
+       default:
+               goto out;
+       }
+       r = 0;
+out:
+       return r;
+}
+
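+/* Copy the MSI fields of a kernel routing entry into a struct kvm_msi */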
+static void kvm_populate_msi(struct kvm_kernel_irq_routing_entry *e,
+                            struct kvm_msi *msi)
+{
+       msi->address_lo = e->msi.address_lo;
+       msi->address_hi = e->msi.address_hi;
+       msi->data = e->msi.data;
+       msi->flags = e->msi.flags;
+       msi->devid = e->msi.devid;
+}
+
+/**
+ * kvm_set_msi - inject the MSI corresponding to the
+ * MSI routing entry
+ *
+ * This is the entry point for irqfd MSI injection
+ * and userspace MSI injection.
+ */
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+               struct kvm *kvm, int irq_source_id,
+               int level, bool line_status)
+{
+       struct kvm_msi msi;
+
+       if (!vgic_has_its(kvm))
+               return -ENODEV;
+
+       if (!level)
+               return -1;
+
+       kvm_populate_msi(e, &msi);
+       return vgic_its_inject_msi(kvm, &msi);
+}
+
+/**
+ * kvm_arch_set_irq_inatomic - fast-path for irqfd injection
+ *
+ * Currently only direct MSI injection is supported.
+ */
+int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
+                             struct kvm *kvm, int irq_source_id, int level,
+                             bool line_status)
+{
+       if (e->type == KVM_IRQ_ROUTING_MSI && vgic_has_its(kvm) && level) {
+               struct kvm_msi msi;
+
+               kvm_populate_msi(e, &msi);
+               if (!vgic_its_inject_cached_translation(kvm, &msi))
+                       return 0;
+       }
+
+       return -EWOULDBLOCK;
+}
+
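+/*
+ * Install a default GSI routing table: each of the nr_spis GSIs is routed
+ * 1:1 to the SPI pin with the same number on irqchip 0.
+ */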
+int kvm_vgic_setup_default_irq_routing(struct kvm *kvm)
+{
+       struct kvm_irq_routing_entry *entries;
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       u32 nr = dist->nr_spis;
+       int i, ret;
+
+       entries = kcalloc(nr, sizeof(*entries), GFP_KERNEL);
+       if (!entries)
+               return -ENOMEM;
+
+       for (i = 0; i < nr; i++) {
+               entries[i].gsi = i;
+               entries[i].type = KVM_IRQ_ROUTING_IRQCHIP;
+               entries[i].u.irqchip.irqchip = 0;
+               entries[i].u.irqchip.pin = i;
+       }
+       ret = kvm_set_irq_routing(kvm, entries, nr, 0);
+       kfree(entries);
+       return ret;
+}
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
new file mode 100644 (file)
index 0000000..c012a52
--- /dev/null
@@ -0,0 +1,2783 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * GICv3 ITS emulation
+ *
+ * Copyright (C) 2015,2016 ARM Ltd.
+ * Author: Andre Przywara <andre.przywara@arm.com>
+ */
+
+#include <linux/cpu.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/uaccess.h>
+#include <linux/list_sort.h>
+
+#include <linux/irqchip/arm-gic-v3.h>
+
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_mmu.h>
+
+#include "vgic.h"
+#include "vgic-mmio.h"
+
+static int vgic_its_save_tables_v0(struct vgic_its *its);
+static int vgic_its_restore_tables_v0(struct vgic_its *its);
+static int vgic_its_commit_v0(struct vgic_its *its);
+static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
+                            struct kvm_vcpu *filter_vcpu, bool needs_inv);
+
+/*
+ * Creates a new (reference to a) struct vgic_irq for a given LPI.
+ * If this LPI is already mapped on another ITS, we increase its refcount
+ * and return a pointer to the existing structure.
+ * If this is a "new" LPI, we allocate and initialize a new struct vgic_irq.
+ * This function returns a pointer to the _unlocked_ structure.
+ */
+static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
+                                    struct kvm_vcpu *vcpu)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intid), *oldirq;
+       unsigned long flags;
+       int ret;
+
+       /* In this case there is no put, since we keep the reference. */
+       if (irq)
+               return irq;
+
+       irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL);
+       if (!irq)
+               return ERR_PTR(-ENOMEM);
+
+       INIT_LIST_HEAD(&irq->lpi_list);
+       INIT_LIST_HEAD(&irq->ap_list);
+       raw_spin_lock_init(&irq->irq_lock);
+
+       irq->config = VGIC_CONFIG_EDGE;
+       kref_init(&irq->refcount);
+       irq->intid = intid;
+       irq->target_vcpu = vcpu;
+       irq->group = 1;
+
+       raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+
+       /*
+        * There could be a race with another vgic_add_lpi(), so we need to
+        * check that we don't add a second list entry with the same LPI.
+        */
+       list_for_each_entry(oldirq, &dist->lpi_list_head, lpi_list) {
+               if (oldirq->intid != intid)
+                       continue;
+
+               /* Someone was faster with adding this LPI, let's use that. */
+               kfree(irq);
+               irq = oldirq;
+
+               /*
+                * This increases the refcount, the caller is expected to
+                * call vgic_put_irq() on the returned pointer once it's
+                * finished with the IRQ.
+                */
+               vgic_get_irq_kref(irq);
+
+               goto out_unlock;
+       }
+
+       list_add_tail(&irq->lpi_list, &dist->lpi_list_head);
+       dist->lpi_list_count++;
+
+out_unlock:
+       raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+
+       /*
+        * We "cache" the configuration table entries in our struct vgic_irq's.
+        * However we only have those structs for mapped IRQs, so we read in
+        * the respective config data from memory here upon mapping the LPI.
+        *
+        * Should any of these fail, behave as if we couldn't create the LPI
+        * by dropping the refcount and returning the error.
+        */
+       ret = update_lpi_config(kvm, irq, NULL, false);
+       if (ret) {
+               vgic_put_irq(kvm, irq);
+               return ERR_PTR(ret);
+       }
+
+       ret = vgic_v3_lpi_sync_pending_status(kvm, irq);
+       if (ret) {
+               vgic_put_irq(kvm, irq);
+               return ERR_PTR(ret);
+       }
+
+       return irq;
+}
+
+struct its_device {
+       struct list_head dev_list;
+
+       /* the head for the list of ITTEs */
+       struct list_head itt_head;
+       u32 num_eventid_bits;
+       gpa_t itt_addr;
+       u32 device_id;
+};
+
+#define COLLECTION_NOT_MAPPED ((u32)~0)
+
+struct its_collection {
+       struct list_head coll_list;
+
+       u32 collection_id;
+       u32 target_addr;
+};
+
+#define its_is_collection_mapped(coll) ((coll) && \
+                               ((coll)->target_addr != COLLECTION_NOT_MAPPED))
+
+struct its_ite {
+       struct list_head ite_list;
+
+       struct vgic_irq *irq;
+       struct its_collection *collection;
+       u32 event_id;
+};
+
+struct vgic_translation_cache_entry {
+       struct list_head        entry;
+       phys_addr_t             db;
+       u32                     devid;
+       u32                     eventid;
+       struct vgic_irq         *irq;
+};
+
+/**
+ * struct vgic_its_abi - ITS abi ops and settings
+ * @cte_esz: collection table entry size
+ * @dte_esz: device table entry size
+ * @ite_esz: interrupt translation table entry size
+ * @save_tables: save the ITS tables into guest RAM
+ * @restore_tables: restore the ITS internal structs from tables
+ *  stored in guest RAM
+ * @commit: initialize the registers which expose the ABI settings,
+ *  especially the entry sizes
+ */
+struct vgic_its_abi {
+       int cte_esz;
+       int dte_esz;
+       int ite_esz;
+       int (*save_tables)(struct vgic_its *its);
+       int (*restore_tables)(struct vgic_its *its);
+       int (*commit)(struct vgic_its *its);
+};
+
+#define ABI_0_ESZ      8
+#define ESZ_MAX                ABI_0_ESZ
+
+static const struct vgic_its_abi its_table_abi_versions[] = {
+       [0] = {
+        .cte_esz = ABI_0_ESZ,
+        .dte_esz = ABI_0_ESZ,
+        .ite_esz = ABI_0_ESZ,
+        .save_tables = vgic_its_save_tables_v0,
+        .restore_tables = vgic_its_restore_tables_v0,
+        .commit = vgic_its_commit_v0,
+       },
+};
+
+#define NR_ITS_ABIS    ARRAY_SIZE(its_table_abi_versions)
+
+inline const struct vgic_its_abi *vgic_its_get_abi(struct vgic_its *its)
+{
+       return &its_table_abi_versions[its->abi_rev];
+}
+
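+/* Switch the ITS to the given ABI revision and commit its register values */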
+static int vgic_its_set_abi(struct vgic_its *its, u32 rev)
+{
+       const struct vgic_its_abi *abi;
+
+       its->abi_rev = rev;
+       abi = vgic_its_get_abi(its);
+       return abi->commit(its);
+}
+
+/*
+ * Finds and returns a device in the device table for an ITS.
+ * Must be called with the its_lock mutex held.
+ */
+static struct its_device *find_its_device(struct vgic_its *its, u32 device_id)
+{
+       struct its_device *device;
+
+       list_for_each_entry(device, &its->device_list, dev_list)
+               if (device_id == device->device_id)
+                       return device;
+
+       return NULL;
+}
+
+/*
+ * Finds and returns an interrupt translation table entry (ITTE) for a given
+ * Device ID/Event ID pair on an ITS.
+ * Must be called with the its_lock mutex held.
+ */
+static struct its_ite *find_ite(struct vgic_its *its, u32 device_id,
+                                 u32 event_id)
+{
+       struct its_device *device;
+       struct its_ite *ite;
+
+       device = find_its_device(its, device_id);
+       if (device == NULL)
+               return NULL;
+
+       list_for_each_entry(ite, &device->itt_head, ite_list)
+               if (ite->event_id == event_id)
+                       return ite;
+
+       return NULL;
+}
+
+/*
+ * To be usable as an iterator, this macro deliberately omits the
+ * enclosing parentheses.
+ */
+#define for_each_lpi_its(dev, ite, its) \
+       list_for_each_entry(dev, &(its)->device_list, dev_list) \
+               list_for_each_entry(ite, &(dev)->itt_head, ite_list)
+
+#define GIC_LPI_OFFSET 8192
+
+#define VITS_TYPER_IDBITS 16
+#define VITS_TYPER_DEVBITS 16
+#define VITS_DTE_MAX_DEVID_OFFSET      (BIT(14) - 1)
+#define VITS_ITE_MAX_EVENTID_OFFSET    (BIT(16) - 1)
+
+/*
+ * Finds and returns a collection in the ITS collection table.
+ * Must be called with the its_lock mutex held.
+ */
+static struct its_collection *find_collection(struct vgic_its *its, int coll_id)
+{
+       struct its_collection *collection;
+
+       list_for_each_entry(collection, &its->collection_list, coll_list) {
+               if (coll_id == collection->collection_id)
+                       return collection;
+       }
+
+       return NULL;
+}
+
+#define LPI_PROP_ENABLE_BIT(p) ((p) & LPI_PROP_ENABLED)
+#define LPI_PROP_PRIORITY(p)   ((p) & 0xfc)
+
+/*
+ * Reads the configuration data for a given LPI from guest memory and
+ * updates the fields in struct vgic_irq.
+ * If filter_vcpu is not NULL, the update is only applied if the IRQ is
+ * targeting this VCPU; if filter_vcpu is NULL, it is applied unconditionally.
+ */
+static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
+                            struct kvm_vcpu *filter_vcpu, bool needs_inv)
+{
+       u64 propbase = GICR_PROPBASER_ADDRESS(kvm->arch.vgic.propbaser);
+       u8 prop;
+       int ret;
+       unsigned long flags;
+
+       ret = kvm_read_guest_lock(kvm, propbase + irq->intid - GIC_LPI_OFFSET,
+                                 &prop, 1);
+
+       if (ret)
+               return ret;
+
+       raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+       if (!filter_vcpu || filter_vcpu == irq->target_vcpu) {
+               irq->priority = LPI_PROP_PRIORITY(prop);
+               irq->enabled = LPI_PROP_ENABLE_BIT(prop);
+
+               if (!irq->hw) {
+                       vgic_queue_irq_unlock(kvm, irq, flags);
+                       return 0;
+               }
+       }
+
+       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+       if (irq->hw)
+               return its_prop_update_vlpi(irq->host_irq, prop, needs_inv);
+
+       return 0;
+}
+
+/*
+ * Create a snapshot of the current LPIs targeting @vcpu, so that we can
+ * enumerate those LPIs without holding any lock.
+ * Returns their number and puts the kmalloc'ed array into intid_ptr.
+ */
+int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct vgic_irq *irq;
+       unsigned long flags;
+       u32 *intids;
+       int irq_count, i = 0;
+
+       /*
+        * There is an obvious race between allocating the array and LPIs
+        * being mapped/unmapped. If we ended up here as a result of a
+        * command, we're safe (locks are held, preventing another
+        * command). If coming from another path (such as enabling LPIs),
+        * we must be careful not to overrun the array.
+        */
+       irq_count = READ_ONCE(dist->lpi_list_count);
+       intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL);
+       if (!intids)
+               return -ENOMEM;
+
+       raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+       list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+               if (i == irq_count)
+                       break;
+               /* We don't need to "get" the IRQ, as we hold the list lock. */
+               if (vcpu && irq->target_vcpu != vcpu)
+                       continue;
+               intids[i++] = irq->intid;
+       }
+       raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+
+       *intid_ptr = intids;
+       return i;
+}
+
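+/*
+ * Update the target VCPU of an LPI. If the LPI is backed by a hardware
+ * VLPI (direct injection), also move the VLPI mapping to the new vcpu's vPE.
+ */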
+static int update_affinity(struct vgic_irq *irq, struct kvm_vcpu *vcpu)
+{
+       int ret = 0;
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&irq->irq_lock, flags);
+       irq->target_vcpu = vcpu;
+       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+       if (irq->hw) {
+               struct its_vlpi_map map;
+
+               ret = its_get_vlpi(irq->host_irq, &map);
+               if (ret)
+                       return ret;
+
+               if (map.vpe)
+                       atomic_dec(&map.vpe->vlpi_count);
+               map.vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+               atomic_inc(&map.vpe->vlpi_count);
+
+               ret = its_map_vlpi(irq->host_irq, &map);
+       }
+
+       return ret;
+}
+
+/*
+ * Promotes the ITS view of affinity of an ITTE (which redistributor this LPI
+ * is targeting) to the VGIC's view, which deals with target VCPUs.
+ * Needs to be called whenever either the collection for an LPI has
+ * changed or the collection itself has been retargeted.
+ */
+static void update_affinity_ite(struct kvm *kvm, struct its_ite *ite)
+{
+       struct kvm_vcpu *vcpu;
+
+       if (!its_is_collection_mapped(ite->collection))
+               return;
+
+       vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr);
+       update_affinity(ite->irq, vcpu);
+}
+
+/*
+ * Updates the target VCPU for every LPI targeting this collection.
+ * Must be called with the its_lock mutex held.
+ */
+static void update_affinity_collection(struct kvm *kvm, struct vgic_its *its,
+                                      struct its_collection *coll)
+{
+       struct its_device *device;
+       struct its_ite *ite;
+
+       for_each_lpi_its(device, ite, its) {
+               if (!ite->collection || coll != ite->collection)
+                       continue;
+
+               update_affinity_ite(kvm, ite);
+       }
+}
+
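+/*
+ * Compute the number of interrupt IDs supported as per GICR_PROPBASER.IDbits,
+ * capped at the number of ID bits the ITS emulation supports.
+ */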
+static u32 max_lpis_propbaser(u64 propbaser)
+{
+       int nr_idbits = (propbaser & 0x1f) + 1;
+
+       return 1U << min(nr_idbits, INTERRUPT_ID_BITS_ITS);
+}
+
+/*
+ * Sync the pending table pending bit of LPIs targeting @vcpu
+ * with our own data structures. This relies on the LPI being
+ * mapped before.
+ */
+static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
+{
+       gpa_t pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);
+       struct vgic_irq *irq;
+       int last_byte_offset = -1;
+       int ret = 0;
+       u32 *intids;
+       int nr_irqs, i;
+       unsigned long flags;
+       u8 pendmask;
+
+       nr_irqs = vgic_copy_lpi_list(vcpu->kvm, vcpu, &intids);
+       if (nr_irqs < 0)
+               return nr_irqs;
+
+       for (i = 0; i < nr_irqs; i++) {
+               int byte_offset, bit_nr;
+
+               byte_offset = intids[i] / BITS_PER_BYTE;
+               bit_nr = intids[i] % BITS_PER_BYTE;
+
+               /*
+                * For contiguously allocated LPIs chances are we just read
+                * this very same byte in the last iteration. Reuse that.
+                */
+               if (byte_offset != last_byte_offset) {
+                       ret = kvm_read_guest_lock(vcpu->kvm,
+                                                 pendbase + byte_offset,
+                                                 &pendmask, 1);
+                       if (ret) {
+                               kfree(intids);
+                               return ret;
+                       }
+                       last_byte_offset = byte_offset;
+               }
+
+               irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]);
+               raw_spin_lock_irqsave(&irq->irq_lock, flags);
+               irq->pending_latch = pendmask & (1U << bit_nr);
+               vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+
+       kfree(intids);
+
+       return ret;
+}
+
+static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm,
+                                             struct vgic_its *its,
+                                             gpa_t addr, unsigned int len)
+{
+       const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+       u64 reg = GITS_TYPER_PLPIS;
+
+       /*
+        * We use linear CPU numbers for redistributor addressing,
+        * so GITS_TYPER.PTA is 0.
+        * Also we force all PROPBASER registers to be the same, so
+        * CommonLPIAff is 0 as well.
+        * To avoid memory waste in the guest, we keep the number of IDBits and
+        * DevBits low - at least for the time being.
+        */
+       reg |= GIC_ENCODE_SZ(VITS_TYPER_DEVBITS, 5) << GITS_TYPER_DEVBITS_SHIFT;
+       reg |= GIC_ENCODE_SZ(VITS_TYPER_IDBITS, 5) << GITS_TYPER_IDBITS_SHIFT;
+       reg |= GIC_ENCODE_SZ(abi->ite_esz, 4) << GITS_TYPER_ITT_ENTRY_SIZE_SHIFT;
+
+       return extract_bytes(reg, addr & 7, len);
+}
+
+static unsigned long vgic_mmio_read_its_iidr(struct kvm *kvm,
+                                            struct vgic_its *its,
+                                            gpa_t addr, unsigned int len)
+{
+       u32 val;
+
+       val = (its->abi_rev << GITS_IIDR_REV_SHIFT) & GITS_IIDR_REV_MASK;
+       val |= (PRODUCT_ID_KVM << GITS_IIDR_PRODUCTID_SHIFT) | IMPLEMENTER_ARM;
+       return val;
+}
+
+static int vgic_mmio_uaccess_write_its_iidr(struct kvm *kvm,
+                                           struct vgic_its *its,
+                                           gpa_t addr, unsigned int len,
+                                           unsigned long val)
+{
+       u32 rev = GITS_IIDR_REV(val);
+
+       if (rev >= NR_ITS_ABIS)
+               return -EINVAL;
+       return vgic_its_set_abi(its, rev);
+}
+
+static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm,
+                                              struct vgic_its *its,
+                                              gpa_t addr, unsigned int len)
+{
+       switch (addr & 0xffff) {
+       case GITS_PIDR0:
+               return 0x92;    /* part number, bits[7:0] */
+       case GITS_PIDR1:
+               return 0xb4;    /* part number, bits[11:8] */
+       case GITS_PIDR2:
+               return GIC_PIDR2_ARCH_GICv3 | 0x0b;
+       case GITS_PIDR4:
+               return 0x40;    /* This is a 64K software visible page */
+       /* The following are the ID registers for (any) GIC. */
+       case GITS_CIDR0:
+               return 0x0d;
+       case GITS_CIDR1:
+               return 0xf0;
+       case GITS_CIDR2:
+               return 0x05;
+       case GITS_CIDR3:
+               return 0xb1;
+       }
+
+       return 0;
+}
+
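+/*
+ * Look up a devid/eventid pair in the LPI translation cache. Must be called
+ * with the lpi_list_lock held.
+ */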
+static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist,
+                                              phys_addr_t db,
+                                              u32 devid, u32 eventid)
+{
+       struct vgic_translation_cache_entry *cte;
+
+       list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
+               /*
+                * If we hit a NULL entry, there is nothing after this
+                * point.
+                */
+               if (!cte->irq)
+                       break;
+
+               if (cte->db != db || cte->devid != devid ||
+                   cte->eventid != eventid)
+                       continue;
+
+               /*
+                * Move this entry to the head, as it is the most
+                * recently used.
+                */
+               if (!list_is_first(&cte->entry, &dist->lpi_translation_cache))
+                       list_move(&cte->entry, &dist->lpi_translation_cache);
+
+               return cte->irq;
+       }
+
+       return NULL;
+}
+
+static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db,
+                                            u32 devid, u32 eventid)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct vgic_irq *irq;
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+       irq = __vgic_its_check_cache(dist, db, devid, eventid);
+       raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+
+       return irq;
+}
+
+static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
+                                      u32 devid, u32 eventid,
+                                      struct vgic_irq *irq)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct vgic_translation_cache_entry *cte;
+       unsigned long flags;
+       phys_addr_t db;
+
+       /* Do not cache a directly injected interrupt */
+       if (irq->hw)
+               return;
+
+       raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+
+       if (unlikely(list_empty(&dist->lpi_translation_cache)))
+               goto out;
+
+       /*
+        * We could have raced with another CPU caching the same
+        * translation behind our back, so let's check that it is not
+        * already in the cache.
+        */
+       db = its->vgic_its_base + GITS_TRANSLATER;
+       if (__vgic_its_check_cache(dist, db, devid, eventid))
+               goto out;
+
+       /* Always reuse the last entry (LRU policy) */
+       cte = list_last_entry(&dist->lpi_translation_cache,
+                             typeof(*cte), entry);
+
+       /*
+        * Caching the translation implies having an extra reference
+        * to the interrupt, so drop the potential reference on what
+        * was in the cache, and increment it on the new interrupt.
+        */
+       if (cte->irq)
+               __vgic_put_lpi_locked(kvm, cte->irq);
+
+       vgic_get_irq_kref(irq);
+
+       cte->db         = db;
+       cte->devid      = devid;
+       cte->eventid    = eventid;
+       cte->irq        = irq;
+
+       /* Move the new translation to the head of the list */
+       list_move(&cte->entry, &dist->lpi_translation_cache);
+
+out:
+       raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+}
+
+void vgic_its_invalidate_cache(struct kvm *kvm)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct vgic_translation_cache_entry *cte;
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+
+       list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
+               /*
+                * If we hit a NULL entry, there is nothing after this
+                * point.
+                */
+               if (!cte->irq)
+                       break;
+
+               __vgic_put_lpi_locked(kvm, cte->irq);
+               cte->irq = NULL;
+       }
+
+       raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+}
+
+int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
+                        u32 devid, u32 eventid, struct vgic_irq **irq)
+{
+       struct kvm_vcpu *vcpu;
+       struct its_ite *ite;
+
+       if (!its->enabled)
+               return -EBUSY;
+
+       ite = find_ite(its, devid, eventid);
+       if (!ite || !its_is_collection_mapped(ite->collection))
+               return E_ITS_INT_UNMAPPED_INTERRUPT;
+
+       vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr);
+       if (!vcpu)
+               return E_ITS_INT_UNMAPPED_INTERRUPT;
+
+       if (!vcpu->arch.vgic_cpu.lpis_enabled)
+               return -EBUSY;
+
+       vgic_its_cache_translation(kvm, its, devid, eventid, ite->irq);
+
+       *irq = ite->irq;
+       return 0;
+}
+
+struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi)
+{
+       u64 address;
+       struct kvm_io_device *kvm_io_dev;
+       struct vgic_io_device *iodev;
+
+       if (!vgic_has_its(kvm))
+               return ERR_PTR(-ENODEV);
+
+       if (!(msi->flags & KVM_MSI_VALID_DEVID))
+               return ERR_PTR(-EINVAL);
+
+       address = (u64)msi->address_hi << 32 | msi->address_lo;
+
+       kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, address);
+       if (!kvm_io_dev)
+               return ERR_PTR(-EINVAL);
+
+       if (kvm_io_dev->ops != &kvm_io_gic_ops)
+               return ERR_PTR(-EINVAL);
+
+       iodev = container_of(kvm_io_dev, struct vgic_io_device, dev);
+       if (iodev->iodev_type != IODEV_ITS)
+               return ERR_PTR(-EINVAL);
+
+       return iodev->its;
+}
+
+/*
+ * Find the target VCPU and the LPI number for a given devid/eventid pair
+ * and make this IRQ pending, possibly injecting it.
+ * Must be called with the its_lock mutex held.
+ * Returns 0 on success, a positive error value for any ITS mapping
+ * related errors and negative error values for generic errors.
+ */
+static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its,
+                               u32 devid, u32 eventid)
+{
+       struct vgic_irq *irq = NULL;
+       unsigned long flags;
+       int err;
+
+       err = vgic_its_resolve_lpi(kvm, its, devid, eventid, &irq);
+       if (err)
+               return err;
+
+       if (irq->hw)
+               return irq_set_irqchip_state(irq->host_irq,
+                                            IRQCHIP_STATE_PENDING, true);
+
+       raw_spin_lock_irqsave(&irq->irq_lock, flags);
+       irq->pending_latch = true;
+       vgic_queue_irq_unlock(kvm, irq, flags);
+
+       return 0;
+}
+
+int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi)
+{
+       struct vgic_irq *irq;
+       unsigned long flags;
+       phys_addr_t db;
+
+       db = (u64)msi->address_hi << 32 | msi->address_lo;
+       irq = vgic_its_check_cache(kvm, db, msi->devid, msi->data);
+
+       if (!irq)
+               return -1;
+
+       raw_spin_lock_irqsave(&irq->irq_lock, flags);
+       irq->pending_latch = true;
+       vgic_queue_irq_unlock(kvm, irq, flags);
+
+       return 0;
+}
+
+/*
+ * Queries the KVM IO bus framework to get the ITS pointer from the given
+ * doorbell address, then calls vgic_its_trigger_msi() with the decoded data.
+ * As per the KVM_SIGNAL_MSI API description, returns 1 on success.
+ */
+int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+       struct vgic_its *its;
+       int ret;
+
+       if (!vgic_its_inject_cached_translation(kvm, msi))
+               return 1;
+
+       its = vgic_msi_to_its(kvm, msi);
+       if (IS_ERR(its))
+               return PTR_ERR(its);
+
+       mutex_lock(&its->its_lock);
+       ret = vgic_its_trigger_msi(kvm, its, msi->devid, msi->data);
+       mutex_unlock(&its->its_lock);
+
+       if (ret < 0)
+               return ret;
+
+       /*
+        * KVM_SIGNAL_MSI demands a return value > 0 for success and 0
+        * if the guest has blocked the MSI. So we map any LPI mapping
+        * related error to that.
+        */
+       if (ret)
+               return 0;
+       else
+               return 1;
+}
+
+/* Requires the its_lock to be held. */
+static void its_free_ite(struct kvm *kvm, struct its_ite *ite)
+{
+       list_del(&ite->ite_list);
+
+       /* This put matches the get in vgic_add_lpi. */
+       if (ite->irq) {
+               if (ite->irq->hw)
+                       WARN_ON(its_unmap_vlpi(ite->irq->host_irq));
+
+               vgic_put_irq(kvm, ite->irq);
+       }
+
+       kfree(ite);
+}
+
+static u64 its_cmd_mask_field(u64 *its_cmd, int word, int shift, int size)
+{
+       return (le64_to_cpu(its_cmd[word]) >> shift) & (BIT_ULL(size) - 1);
+}
+
+#define its_cmd_get_command(cmd)       its_cmd_mask_field(cmd, 0,  0,  8)
+#define its_cmd_get_deviceid(cmd)      its_cmd_mask_field(cmd, 0, 32, 32)
+#define its_cmd_get_size(cmd)          (its_cmd_mask_field(cmd, 1,  0,  5) + 1)
+#define its_cmd_get_id(cmd)            its_cmd_mask_field(cmd, 1,  0, 32)
+#define its_cmd_get_physical_id(cmd)   its_cmd_mask_field(cmd, 1, 32, 32)
+#define its_cmd_get_collection(cmd)    its_cmd_mask_field(cmd, 2,  0, 16)
+#define its_cmd_get_ittaddr(cmd)       (its_cmd_mask_field(cmd, 2,  8, 44) << 8)
+#define its_cmd_get_target_addr(cmd)   its_cmd_mask_field(cmd, 2, 16, 32)
+#define its_cmd_get_validbit(cmd)      its_cmd_mask_field(cmd, 2, 63,  1)
+
+/*
+ * The DISCARD command frees an Interrupt Translation Table Entry (ITTE).
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its,
+                                      u64 *its_cmd)
+{
+       u32 device_id = its_cmd_get_deviceid(its_cmd);
+       u32 event_id = its_cmd_get_id(its_cmd);
+       struct its_ite *ite;
+
+       ite = find_ite(its, device_id, event_id);
+       if (ite && its_is_collection_mapped(ite->collection)) {
+               /*
+                * Though the spec talks about removing the pending state, we
+                * don't bother here since we clear the ITTE anyway and the
+                * pending state is a property of the ITTE struct.
+                */
+               vgic_its_invalidate_cache(kvm);
+
+               its_free_ite(kvm, ite);
+               return 0;
+       }
+
+       return E_ITS_DISCARD_UNMAPPED_INTERRUPT;
+}
+
+/*
+ * The MOVI command moves an ITTE to a different collection.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its,
+                                   u64 *its_cmd)
+{
+       u32 device_id = its_cmd_get_deviceid(its_cmd);
+       u32 event_id = its_cmd_get_id(its_cmd);
+       u32 coll_id = its_cmd_get_collection(its_cmd);
+       struct kvm_vcpu *vcpu;
+       struct its_ite *ite;
+       struct its_collection *collection;
+
+       ite = find_ite(its, device_id, event_id);
+       if (!ite)
+               return E_ITS_MOVI_UNMAPPED_INTERRUPT;
+
+       if (!its_is_collection_mapped(ite->collection))
+               return E_ITS_MOVI_UNMAPPED_COLLECTION;
+
+       collection = find_collection(its, coll_id);
+       if (!its_is_collection_mapped(collection))
+               return E_ITS_MOVI_UNMAPPED_COLLECTION;
+
+       ite->collection = collection;
+       vcpu = kvm_get_vcpu(kvm, collection->target_addr);
+
+       vgic_its_invalidate_cache(kvm);
+
+       return update_affinity(ite->irq, vcpu);
+}
+
+/*
+ * Check whether an ID can be stored into the corresponding guest table.
+ * For a direct table this is pretty easy, but gets a bit nasty for
+ * indirect tables. We check whether the resulting guest physical address
+ * is actually valid (covered by a memslot and guest accessible).
+ * For this we have to read the respective first level entry.
+ */
+static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id,
+                             gpa_t *eaddr)
+{
+       int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
+       u64 indirect_ptr, type = GITS_BASER_TYPE(baser);
+       phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser);
+       int esz = GITS_BASER_ENTRY_SIZE(baser);
+       int index, idx;
+       gfn_t gfn;
+       bool ret;
+
+       switch (type) {
+       case GITS_BASER_TYPE_DEVICE:
+               if (id >= BIT_ULL(VITS_TYPER_DEVBITS))
+                       return false;
+               break;
+       case GITS_BASER_TYPE_COLLECTION:
+               /* as GITS_TYPER.CIL == 0, ITS supports 16-bit collection ID */
+               if (id >= BIT_ULL(16))
+                       return false;
+               break;
+       default:
+               return false;
+       }
+
+       if (!(baser & GITS_BASER_INDIRECT)) {
+               phys_addr_t addr;
+
+               if (id >= (l1_tbl_size / esz))
+                       return false;
+
+               addr = base + id * esz;
+               gfn = addr >> PAGE_SHIFT;
+
+               if (eaddr)
+                       *eaddr = addr;
+
+               goto out;
+       }
+
+       /* calculate and check the index into the 1st level */
+       index = id / (SZ_64K / esz);
+       if (index >= (l1_tbl_size / sizeof(u64)))
+               return false;
+
+       /* Each 1st level entry is represented by a 64-bit value. */
+       if (kvm_read_guest_lock(its->dev->kvm,
+                          base + index * sizeof(indirect_ptr),
+                          &indirect_ptr, sizeof(indirect_ptr)))
+               return false;
+
+       indirect_ptr = le64_to_cpu(indirect_ptr);
+
+       /* check the valid bit of the first level entry */
+       if (!(indirect_ptr & BIT_ULL(63)))
+               return false;
+
+       /* Mask the guest physical address and calculate the frame number. */
+       indirect_ptr &= GENMASK_ULL(51, 16);
+
+       /* Find the address of the actual entry */
+       index = id % (SZ_64K / esz);
+       indirect_ptr += index * esz;
+       gfn = indirect_ptr >> PAGE_SHIFT;
+
+       if (eaddr)
+               *eaddr = indirect_ptr;
+
+out:
+       idx = srcu_read_lock(&its->dev->kvm->srcu);
+       ret = kvm_is_visible_gfn(its->dev->kvm, gfn);
+       srcu_read_unlock(&its->dev->kvm->srcu, idx);
+       return ret;
+}
+
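+/*
+ * Allocate a new, not yet mapped collection, after checking that its ID fits
+ * in the guest's collection table, and add it to the ITS collection list.
+ */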
+static int vgic_its_alloc_collection(struct vgic_its *its,
+                                    struct its_collection **colp,
+                                    u32 coll_id)
+{
+       struct its_collection *collection;
+
+       if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL))
+               return E_ITS_MAPC_COLLECTION_OOR;
+
+       collection = kzalloc(sizeof(*collection), GFP_KERNEL);
+       if (!collection)
+               return -ENOMEM;
+
+       collection->collection_id = coll_id;
+       collection->target_addr = COLLECTION_NOT_MAPPED;
+
+       list_add_tail(&collection->coll_list, &its->collection_list);
+       *colp = collection;
+
+       return 0;
+}
+
+static void vgic_its_free_collection(struct vgic_its *its, u32 coll_id)
+{
+       struct its_collection *collection;
+       struct its_device *device;
+       struct its_ite *ite;
+
+       /*
+        * Clearing the mapping for that collection ID removes the
+        * entry from the list. If there wasn't any before, we can
+        * go home early.
+        */
+       collection = find_collection(its, coll_id);
+       if (!collection)
+               return;
+
+       for_each_lpi_its(device, ite, its)
+               if (ite->collection &&
+                   ite->collection->collection_id == coll_id)
+                       ite->collection = NULL;
+
+       list_del(&collection->coll_list);
+       kfree(collection);
+}
+
+/* Must be called with its_lock mutex held */
+static struct its_ite *vgic_its_alloc_ite(struct its_device *device,
+                                         struct its_collection *collection,
+                                         u32 event_id)
+{
+       struct its_ite *ite;
+
+       ite = kzalloc(sizeof(*ite), GFP_KERNEL);
+       if (!ite)
+               return ERR_PTR(-ENOMEM);
+
+       ite->event_id   = event_id;
+       ite->collection = collection;
+
+       list_add_tail(&ite->ite_list, &device->itt_head);
+       return ite;
+}
+
+/*
+ * The MAPTI and MAPI commands map LPIs to ITTEs.
+ * Must be called with its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
+                                   u64 *its_cmd)
+{
+       u32 device_id = its_cmd_get_deviceid(its_cmd);
+       u32 event_id = its_cmd_get_id(its_cmd);
+       u32 coll_id = its_cmd_get_collection(its_cmd);
+       struct its_ite *ite;
+       struct kvm_vcpu *vcpu = NULL;
+       struct its_device *device;
+       struct its_collection *collection, *new_coll = NULL;
+       struct vgic_irq *irq;
+       int lpi_nr;
+
+       device = find_its_device(its, device_id);
+       if (!device)
+               return E_ITS_MAPTI_UNMAPPED_DEVICE;
+
+       if (event_id >= BIT_ULL(device->num_eventid_bits))
+               return E_ITS_MAPTI_ID_OOR;
+
+       if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI)
+               lpi_nr = its_cmd_get_physical_id(its_cmd);
+       else
+               lpi_nr = event_id;
+       if (lpi_nr < GIC_LPI_OFFSET ||
+           lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser))
+               return E_ITS_MAPTI_PHYSICALID_OOR;
+
+       /* If there is an existing mapping, behavior is UNPREDICTABLE. */
+       if (find_ite(its, device_id, event_id))
+               return 0;
+
+       collection = find_collection(its, coll_id);
+       if (!collection) {
+               int ret = vgic_its_alloc_collection(its, &collection, coll_id);
+               if (ret)
+                       return ret;
+               new_coll = collection;
+       }
+
+       ite = vgic_its_alloc_ite(device, collection, event_id);
+       if (IS_ERR(ite)) {
+               if (new_coll)
+                       vgic_its_free_collection(its, coll_id);
+               return PTR_ERR(ite);
+       }
+
+       if (its_is_collection_mapped(collection))
+               vcpu = kvm_get_vcpu(kvm, collection->target_addr);
+
+       irq = vgic_add_lpi(kvm, lpi_nr, vcpu);
+       if (IS_ERR(irq)) {
+               if (new_coll)
+                       vgic_its_free_collection(its, coll_id);
+               its_free_ite(kvm, ite);
+               return PTR_ERR(irq);
+       }
+       ite->irq = irq;
+
+       return 0;
+}
+
+/* Requires the its_lock to be held. */
+static void vgic_its_free_device(struct kvm *kvm, struct its_device *device)
+{
+       struct its_ite *ite, *temp;
+
+       /*
+        * The spec says that unmapping a device with still valid
+        * ITTEs associated is UNPREDICTABLE. We remove all ITTEs,
+        * since we cannot leave the memory unreferenced.
+        */
+       list_for_each_entry_safe(ite, temp, &device->itt_head, ite_list)
+               its_free_ite(kvm, ite);
+
+       vgic_its_invalidate_cache(kvm);
+
+       list_del(&device->dev_list);
+       kfree(device);
+}
+
+/* its lock must be held */
+static void vgic_its_free_device_list(struct kvm *kvm, struct vgic_its *its)
+{
+       struct its_device *cur, *temp;
+
+       list_for_each_entry_safe(cur, temp, &its->device_list, dev_list)
+               vgic_its_free_device(kvm, cur);
+}
+
+/* its lock must be held */
+static void vgic_its_free_collection_list(struct kvm *kvm, struct vgic_its *its)
+{
+       struct its_collection *cur, *temp;
+
+       list_for_each_entry_safe(cur, temp, &its->collection_list, coll_list)
+               vgic_its_free_collection(its, cur->collection_id);
+}
+
+/* Must be called with its_lock mutex held */
+static struct its_device *vgic_its_alloc_device(struct vgic_its *its,
+                                               u32 device_id, gpa_t itt_addr,
+                                               u8 num_eventid_bits)
+{
+       struct its_device *device;
+
+       device = kzalloc(sizeof(*device), GFP_KERNEL);
+       if (!device)
+               return ERR_PTR(-ENOMEM);
+
+       device->device_id = device_id;
+       device->itt_addr = itt_addr;
+       device->num_eventid_bits = num_eventid_bits;
+       INIT_LIST_HEAD(&device->itt_head);
+
+       list_add_tail(&device->dev_list, &its->device_list);
+       return device;
+}
+
+/*
+ * MAPD maps or unmaps a device ID to Interrupt Translation Tables (ITTs).
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its,
+                                   u64 *its_cmd)
+{
+       u32 device_id = its_cmd_get_deviceid(its_cmd);
+       bool valid = its_cmd_get_validbit(its_cmd);
+       u8 num_eventid_bits = its_cmd_get_size(its_cmd);
+       gpa_t itt_addr = its_cmd_get_ittaddr(its_cmd);
+       struct its_device *device;
+
+       if (!vgic_its_check_id(its, its->baser_device_table, device_id, NULL))
+               return E_ITS_MAPD_DEVICE_OOR;
+
+       if (valid && num_eventid_bits > VITS_TYPER_IDBITS)
+               return E_ITS_MAPD_ITTSIZE_OOR;
+
+       device = find_its_device(its, device_id);
+
+       /*
+        * The spec says that calling MAPD on an already mapped device
+        * invalidates all cached data for this device. We implement this
+        * by removing the mapping and re-establishing it.
+        */
+       if (device)
+               vgic_its_free_device(kvm, device);
+
+       /*
+        * The spec does not say whether unmapping a not-mapped device
+        * is an error, so we are done in any case.
+        */
+       if (!valid)
+               return 0;
+
+       device = vgic_its_alloc_device(its, device_id, itt_addr,
+                                      num_eventid_bits);
+
+       return PTR_ERR_OR_ZERO(device);
+}
+
+/*
+ * The MAPC command maps collection IDs to redistributors.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its,
+                                   u64 *its_cmd)
+{
+       u16 coll_id;
+       u32 target_addr;
+       struct its_collection *collection;
+       bool valid;
+
+       valid = its_cmd_get_validbit(its_cmd);
+       coll_id = its_cmd_get_collection(its_cmd);
+       target_addr = its_cmd_get_target_addr(its_cmd);
+
+       if (target_addr >= atomic_read(&kvm->online_vcpus))
+               return E_ITS_MAPC_PROCNUM_OOR;
+
+       if (!valid) {
+               vgic_its_free_collection(its, coll_id);
+               vgic_its_invalidate_cache(kvm);
+       } else {
+               collection = find_collection(its, coll_id);
+
+               if (!collection) {
+                       int ret;
+
+                       ret = vgic_its_alloc_collection(its, &collection,
+                                                       coll_id);
+                       if (ret)
+                               return ret;
+                       collection->target_addr = target_addr;
+               } else {
+                       collection->target_addr = target_addr;
+                       update_affinity_collection(kvm, its, collection);
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * The CLEAR command removes the pending state for a particular LPI.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its,
+                                    u64 *its_cmd)
+{
+       u32 device_id = its_cmd_get_deviceid(its_cmd);
+       u32 event_id = its_cmd_get_id(its_cmd);
+       struct its_ite *ite;
+
+       ite = find_ite(its, device_id, event_id);
+       if (!ite)
+               return E_ITS_CLEAR_UNMAPPED_INTERRUPT;
+
+       ite->irq->pending_latch = false;
+
+       if (ite->irq->hw)
+               return irq_set_irqchip_state(ite->irq->host_irq,
+                                            IRQCHIP_STATE_PENDING, false);
+
+       return 0;
+}
+
+/*
+ * The INV command syncs the configuration bits from the memory table.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its,
+                                  u64 *its_cmd)
+{
+       u32 device_id = its_cmd_get_deviceid(its_cmd);
+       u32 event_id = its_cmd_get_id(its_cmd);
+       struct its_ite *ite;
+
+       ite = find_ite(its, device_id, event_id);
+       if (!ite)
+               return E_ITS_INV_UNMAPPED_INTERRUPT;
+
+       return update_lpi_config(kvm, ite->irq, NULL, true);
+}
+
+/*
+ * The INVALL command requests flushing of all IRQ data in this collection.
+ * Find the VCPU mapped to that collection, then iterate over the VM's list
+ * of mapped LPIs and update the configuration for each IRQ which targets
+ * the specified vcpu. The configuration will be read from the in-memory
+ * configuration table.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its,
+                                     u64 *its_cmd)
+{
+       u32 coll_id = its_cmd_get_collection(its_cmd);
+       struct its_collection *collection;
+       struct kvm_vcpu *vcpu;
+       struct vgic_irq *irq;
+       u32 *intids;
+       int irq_count, i;
+
+       collection = find_collection(its, coll_id);
+       if (!its_is_collection_mapped(collection))
+               return E_ITS_INVALL_UNMAPPED_COLLECTION;
+
+       vcpu = kvm_get_vcpu(kvm, collection->target_addr);
+
+       irq_count = vgic_copy_lpi_list(kvm, vcpu, &intids);
+       if (irq_count < 0)
+               return irq_count;
+
+       for (i = 0; i < irq_count; i++) {
+               irq = vgic_get_irq(kvm, NULL, intids[i]);
+               if (!irq)
+                       continue;
+               update_lpi_config(kvm, irq, vcpu, false);
+               vgic_put_irq(kvm, irq);
+       }
+
+       kfree(intids);
+
+       if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.its_vm)
+               its_invall_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe);
+
+       return 0;
+}
+
+/*
+ * The MOVALL command moves the pending state of all IRQs targeting one
+ * redistributor to another. We don't hold the pending state in the VCPUs,
+ * but in the IRQs instead, so there is really not much to do for us here.
+ * However the spec says that no IRQ must target the old redistributor
+ * afterwards, so we make sure that no LPI is using the associated target_vcpu.
+ * This command affects all LPIs in the system that target that redistributor.
+ */
+static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its,
+                                     u64 *its_cmd)
+{
+       u32 target1_addr = its_cmd_get_target_addr(its_cmd);
+       u32 target2_addr = its_cmd_mask_field(its_cmd, 3, 16, 32);
+       struct kvm_vcpu *vcpu1, *vcpu2;
+       struct vgic_irq *irq;
+       u32 *intids;
+       int irq_count, i;
+
+       if (target1_addr >= atomic_read(&kvm->online_vcpus) ||
+           target2_addr >= atomic_read(&kvm->online_vcpus))
+               return E_ITS_MOVALL_PROCNUM_OOR;
+
+       if (target1_addr == target2_addr)
+               return 0;
+
+       vcpu1 = kvm_get_vcpu(kvm, target1_addr);
+       vcpu2 = kvm_get_vcpu(kvm, target2_addr);
+
+       irq_count = vgic_copy_lpi_list(kvm, vcpu1, &intids);
+       if (irq_count < 0)
+               return irq_count;
+
+       for (i = 0; i < irq_count; i++) {
+               irq = vgic_get_irq(kvm, NULL, intids[i]);
+
+               update_affinity(irq, vcpu2);
+
+               vgic_put_irq(kvm, irq);
+       }
+
+       vgic_its_invalidate_cache(kvm);
+
+       kfree(intids);
+       return 0;
+}
+
+/*
+ * The INT command injects the LPI associated with that DevID/EvID pair.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_int(struct kvm *kvm, struct vgic_its *its,
+                                  u64 *its_cmd)
+{
+       u32 msi_data = its_cmd_get_id(its_cmd);
+       u64 msi_devid = its_cmd_get_deviceid(its_cmd);
+
+       return vgic_its_trigger_msi(kvm, its, msi_devid, msi_data);
+}
+
+/*
+ * This function is called with the its_cmd lock held, but the ITS data
+ * structure lock dropped.
+ */
+static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its,
+                                  u64 *its_cmd)
+{
+       int ret = -ENODEV;
+
+       mutex_lock(&its->its_lock);
+       switch (its_cmd_get_command(its_cmd)) {
+       case GITS_CMD_MAPD:
+               ret = vgic_its_cmd_handle_mapd(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_MAPC:
+               ret = vgic_its_cmd_handle_mapc(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_MAPI:
+               ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_MAPTI:
+               ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_MOVI:
+               ret = vgic_its_cmd_handle_movi(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_DISCARD:
+               ret = vgic_its_cmd_handle_discard(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_CLEAR:
+               ret = vgic_its_cmd_handle_clear(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_MOVALL:
+               ret = vgic_its_cmd_handle_movall(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_INT:
+               ret = vgic_its_cmd_handle_int(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_INV:
+               ret = vgic_its_cmd_handle_inv(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_INVALL:
+               ret = vgic_its_cmd_handle_invall(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_SYNC:
+               /* we ignore this command: we are in sync all of the time */
+               ret = 0;
+               break;
+       }
+       mutex_unlock(&its->its_lock);
+
+       return ret;
+}
+
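+/* Sanitise the GITS_BASER<n> value: shareability, cacheability and page size */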
+static u64 vgic_sanitise_its_baser(u64 reg)
+{
+       reg = vgic_sanitise_field(reg, GITS_BASER_SHAREABILITY_MASK,
+                                 GITS_BASER_SHAREABILITY_SHIFT,
+                                 vgic_sanitise_shareability);
+       reg = vgic_sanitise_field(reg, GITS_BASER_INNER_CACHEABILITY_MASK,
+                                 GITS_BASER_INNER_CACHEABILITY_SHIFT,
+                                 vgic_sanitise_inner_cacheability);
+       reg = vgic_sanitise_field(reg, GITS_BASER_OUTER_CACHEABILITY_MASK,
+                                 GITS_BASER_OUTER_CACHEABILITY_SHIFT,
+                                 vgic_sanitise_outer_cacheability);
+
+       /* We support only one (ITS) page size: 64K */
+       reg = (reg & ~GITS_BASER_PAGE_SIZE_MASK) | GITS_BASER_PAGE_SIZE_64K;
+
+       return reg;
+}
+
+static u64 vgic_sanitise_its_cbaser(u64 reg)
+{
+       reg = vgic_sanitise_field(reg, GITS_CBASER_SHAREABILITY_MASK,
+                                 GITS_CBASER_SHAREABILITY_SHIFT,
+                                 vgic_sanitise_shareability);
+       reg = vgic_sanitise_field(reg, GITS_CBASER_INNER_CACHEABILITY_MASK,
+                                 GITS_CBASER_INNER_CACHEABILITY_SHIFT,
+                                 vgic_sanitise_inner_cacheability);
+       reg = vgic_sanitise_field(reg, GITS_CBASER_OUTER_CACHEABILITY_MASK,
+                                 GITS_CBASER_OUTER_CACHEABILITY_SHIFT,
+                                 vgic_sanitise_outer_cacheability);
+
+       /* Sanitise the physical address to be 64k aligned. */
+       reg &= ~GENMASK_ULL(15, 12);
+
+       return reg;
+}
+
+static unsigned long vgic_mmio_read_its_cbaser(struct kvm *kvm,
+                                              struct vgic_its *its,
+                                              gpa_t addr, unsigned int len)
+{
+       return extract_bytes(its->cbaser, addr & 7, len);
+}
+
+static void vgic_mmio_write_its_cbaser(struct kvm *kvm, struct vgic_its *its,
+                                      gpa_t addr, unsigned int len,
+                                      unsigned long val)
+{
+       /* When GITS_CTLR.Enable is 1, this register is RO. */
+       if (its->enabled)
+               return;
+
+       mutex_lock(&its->cmd_lock);
+       its->cbaser = update_64bit_reg(its->cbaser, addr & 7, len, val);
+       its->cbaser = vgic_sanitise_its_cbaser(its->cbaser);
+       its->creadr = 0;
+       /*
+        * CWRITER is architecturally UNKNOWN on reset, but we need to reset
+        * it to CREADR to make sure we start with an empty command buffer.
+        */
+       its->cwriter = its->creadr;
+       mutex_unlock(&its->cmd_lock);
+}
+
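+/*
+ * Command queue geometry: GITS_CBASER.Size encodes the number of 4K pages
+ * (minus one), and each ITS command is 32 bytes long.
+ */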
+#define ITS_CMD_BUFFER_SIZE(baser)     ((((baser) & 0xff) + 1) << 12)
+#define ITS_CMD_SIZE                   32
+#define ITS_CMD_OFFSET(reg)            ((reg) & GENMASK(19, 5))
+
+/* Must be called with the cmd_lock held. */
+static void vgic_its_process_commands(struct kvm *kvm, struct vgic_its *its)
+{
+       gpa_t cbaser;
+       u64 cmd_buf[4];
+
+       /* Commands are only processed when the ITS is enabled. */
+       if (!its->enabled)
+               return;
+
+       cbaser = GITS_CBASER_ADDRESS(its->cbaser);
+
+       while (its->cwriter != its->creadr) {
+               int ret = kvm_read_guest_lock(kvm, cbaser + its->creadr,
+                                             cmd_buf, ITS_CMD_SIZE);
+               /*
+                * If kvm_read_guest_lock() fails, this could be due to the
+                * guest programming a bogus value in CBASER or something
+                * else going wrong from which we cannot easily recover.
+                * According to section 6.3.2 in the GICv3 spec we can just
+                * ignore that command then.
+                */
+               if (!ret)
+                       vgic_its_handle_command(kvm, its, cmd_buf);
+
+               its->creadr += ITS_CMD_SIZE;
+               if (its->creadr == ITS_CMD_BUFFER_SIZE(its->cbaser))
+                       its->creadr = 0;
+       }
+}
+
+/*
+ * By writing to CWRITER the guest announces new commands to be processed.
+ * To avoid any races in the first place, we take the its_cmd lock, which
+ * protects our ring buffer variables, so that there is only one user
+ * per ITS handling commands at a given time.
+ */
+static void vgic_mmio_write_its_cwriter(struct kvm *kvm, struct vgic_its *its,
+                                       gpa_t addr, unsigned int len,
+                                       unsigned long val)
+{
+       u64 reg;
+
+       if (!its)
+               return;
+
+       mutex_lock(&its->cmd_lock);
+
+       reg = update_64bit_reg(its->cwriter, addr & 7, len, val);
+       reg = ITS_CMD_OFFSET(reg);
+       if (reg >= ITS_CMD_BUFFER_SIZE(its->cbaser)) {
+               mutex_unlock(&its->cmd_lock);
+               return;
+       }
+       its->cwriter = reg;
+
+       vgic_its_process_commands(kvm, its);
+
+       mutex_unlock(&its->cmd_lock);
+}
+
+static unsigned long vgic_mmio_read_its_cwriter(struct kvm *kvm,
+                                               struct vgic_its *its,
+                                               gpa_t addr, unsigned int len)
+{
+       return extract_bytes(its->cwriter, addr & 0x7, len);
+}
+
+static unsigned long vgic_mmio_read_its_creadr(struct kvm *kvm,
+                                              struct vgic_its *its,
+                                              gpa_t addr, unsigned int len)
+{
+       return extract_bytes(its->creadr, addr & 0x7, len);
+}
+
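+/*
+ * Userspace-only write handler for GITS_CREADR (exposed through the
+ * KVM_DEV_ARM_VGIC_GRP_ITS_REGS attribute group). The guest sees CREADR
+ * as read-only, but migration needs to restore it; this is only allowed
+ * while the ITS is disabled, and the offset must fit within the command
+ * buffer described by CBASER.
+ */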
+static int vgic_mmio_uaccess_write_its_creadr(struct kvm *kvm,
+                                             struct vgic_its *its,
+                                             gpa_t addr, unsigned int len,
+                                             unsigned long val)
+{
+       u32 cmd_offset;
+       int ret = 0;
+
+       mutex_lock(&its->cmd_lock);
+
+       if (its->enabled) {
+               ret = -EBUSY;
+               goto out;
+       }
+
+       cmd_offset = ITS_CMD_OFFSET(val);
+       if (cmd_offset >= ITS_CMD_BUFFER_SIZE(its->cbaser)) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       its->creadr = cmd_offset;
+out:
+       mutex_unlock(&its->cmd_lock);
+       return ret;
+}
+
+#define BASER_INDEX(addr) (((addr) / sizeof(u64)) & 0x7)
+static unsigned long vgic_mmio_read_its_baser(struct kvm *kvm,
+                                             struct vgic_its *its,
+                                             gpa_t addr, unsigned int len)
+{
+       u64 reg;
+
+       switch (BASER_INDEX(addr)) {
+       case 0:
+               reg = its->baser_device_table;
+               break;
+       case 1:
+               reg = its->baser_coll_table;
+               break;
+       default:
+               reg = 0;
+               break;
+       }
+
+       return extract_bytes(reg, addr & 7, len);
+}
+
+#define GITS_BASER_RO_MASK     (GENMASK_ULL(52, 48) | GENMASK_ULL(58, 56))
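+/* GITS_BASER.Type (bits [58:56]) and .Entry_Size (bits [52:48]) are RO. */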
+static void vgic_mmio_write_its_baser(struct kvm *kvm,
+                                     struct vgic_its *its,
+                                     gpa_t addr, unsigned int len,
+                                     unsigned long val)
+{
+       const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+       u64 entry_size, table_type;
+       u64 reg, *regptr, clearbits = 0;
+
+       /* When GITS_CTLR.Enable is 1, we ignore write accesses. */
+       if (its->enabled)
+               return;
+
+       switch (BASER_INDEX(addr)) {
+       case 0:
+               regptr = &its->baser_device_table;
+               entry_size = abi->dte_esz;
+               table_type = GITS_BASER_TYPE_DEVICE;
+               break;
+       case 1:
+               regptr = &its->baser_coll_table;
+               entry_size = abi->cte_esz;
+               table_type = GITS_BASER_TYPE_COLLECTION;
+               clearbits = GITS_BASER_INDIRECT;
+               break;
+       default:
+               return;
+       }
+
+       reg = update_64bit_reg(*regptr, addr & 7, len, val);
+       reg &= ~GITS_BASER_RO_MASK;
+       reg &= ~clearbits;
+
+       reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT;
+       reg |= table_type << GITS_BASER_TYPE_SHIFT;
+       reg = vgic_sanitise_its_baser(reg);
+
+       *regptr = reg;
+
+       if (!(reg & GITS_BASER_VALID)) {
+               /* Take the its_lock to prevent a race with a save/restore */
+               mutex_lock(&its->its_lock);
+               switch (table_type) {
+               case GITS_BASER_TYPE_DEVICE:
+                       vgic_its_free_device_list(kvm, its);
+                       break;
+               case GITS_BASER_TYPE_COLLECTION:
+                       vgic_its_free_collection_list(kvm, its);
+                       break;
+               }
+               mutex_unlock(&its->its_lock);
+       }
+}
+
+static unsigned long vgic_mmio_read_its_ctlr(struct kvm *kvm,
+                                            struct vgic_its *its,
+                                            gpa_t addr, unsigned int len)
+{
+       u32 reg = 0;
+
+       mutex_lock(&its->cmd_lock);
+       if (its->creadr == its->cwriter)
+               reg |= GITS_CTLR_QUIESCENT;
+       if (its->enabled)
+               reg |= GITS_CTLR_ENABLE;
+       mutex_unlock(&its->cmd_lock);
+
+       return reg;
+}
+
+static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its,
+                                    gpa_t addr, unsigned int len,
+                                    unsigned long val)
+{
+       mutex_lock(&its->cmd_lock);
+
+       /*
+        * It is UNPREDICTABLE to enable the ITS if any of CBASER or the
+        * device/collection BASER registers are invalid.
+        */
+       if (!its->enabled && (val & GITS_CTLR_ENABLE) &&
+               (!(its->baser_device_table & GITS_BASER_VALID) ||
+                !(its->baser_coll_table & GITS_BASER_VALID) ||
+                !(its->cbaser & GITS_CBASER_VALID)))
+               goto out;
+
+       its->enabled = !!(val & GITS_CTLR_ENABLE);
+       if (!its->enabled)
+               vgic_its_invalidate_cache(kvm);
+
+       /*
+        * Try to process any pending commands. This function bails out early
+        * if the ITS is disabled or no commands have been queued.
+        */
+       vgic_its_process_commands(kvm, its);
+
+out:
+       mutex_unlock(&its->cmd_lock);
+}
+
+#define REGISTER_ITS_DESC(off, rd, wr, length, acc)            \
+{                                                              \
+       .reg_offset = off,                                      \
+       .len = length,                                          \
+       .access_flags = acc,                                    \
+       .its_read = rd,                                         \
+       .its_write = wr,                                        \
+}
+
+#define REGISTER_ITS_DESC_UACCESS(off, rd, wr, uwr, length, acc)\
+{                                                              \
+       .reg_offset = off,                                      \
+       .len = length,                                          \
+       .access_flags = acc,                                    \
+       .its_read = rd,                                         \
+       .its_write = wr,                                        \
+       .uaccess_its_write = uwr,                               \
+}
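+
+/*
+ * The _UACCESS variant provides a separate write handler that is only used
+ * by the KVM_DEV_ARM_VGIC_GRP_ITS_REGS userspace interface, so that
+ * registers such as GITS_CREADR and GITS_IIDR can be restored on migration
+ * even though the guest itself cannot write them directly.
+ */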
+
+static void its_mmio_write_wi(struct kvm *kvm, struct vgic_its *its,
+                             gpa_t addr, unsigned int len, unsigned long val)
+{
+       /* Ignore */
+}
+
+static struct vgic_register_region its_registers[] = {
+       REGISTER_ITS_DESC(GITS_CTLR,
+               vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_ITS_DESC_UACCESS(GITS_IIDR,
+               vgic_mmio_read_its_iidr, its_mmio_write_wi,
+               vgic_mmio_uaccess_write_its_iidr, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_ITS_DESC(GITS_TYPER,
+               vgic_mmio_read_its_typer, its_mmio_write_wi, 8,
+               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_ITS_DESC(GITS_CBASER,
+               vgic_mmio_read_its_cbaser, vgic_mmio_write_its_cbaser, 8,
+               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_ITS_DESC(GITS_CWRITER,
+               vgic_mmio_read_its_cwriter, vgic_mmio_write_its_cwriter, 8,
+               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_ITS_DESC_UACCESS(GITS_CREADR,
+               vgic_mmio_read_its_creadr, its_mmio_write_wi,
+               vgic_mmio_uaccess_write_its_creadr, 8,
+               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_ITS_DESC(GITS_BASER,
+               vgic_mmio_read_its_baser, vgic_mmio_write_its_baser, 0x40,
+               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_ITS_DESC(GITS_IDREGS_BASE,
+               vgic_mmio_read_its_idregs, its_mmio_write_wi, 0x30,
+               VGIC_ACCESS_32bit),
+};
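+
+/*
+ * Note that the GITS_BASER frame spans 8 registers (0x40 bytes), but only
+ * BASER0 (the device table) and BASER1 (the collection table) are backed
+ * here; the remaining ones read as zero and ignore writes.
+ */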
+
+/* This is called on setting the LPI enable bit in the redistributor. */
+void vgic_enable_lpis(struct kvm_vcpu *vcpu)
+{
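+       /*
+        * If the guest has set GICR_PENDBASER.PTZ, it has declared the LPI
+        * pending table to be all zeroes, so there is no state to read back
+        * from guest memory.
+        */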
+       if (!(vcpu->arch.vgic_cpu.pendbaser & GICR_PENDBASER_PTZ))
+               its_sync_lpi_pending_table(vcpu);
+}
+
+static int vgic_register_its_iodev(struct kvm *kvm, struct vgic_its *its,
+                                  u64 addr)
+{
+       struct vgic_io_device *iodev = &its->iodev;
+       int ret;
+
+       mutex_lock(&kvm->slots_lock);
+       if (!IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) {
+               ret = -EBUSY;
+               goto out;
+       }
+
+       its->vgic_its_base = addr;
+       iodev->regions = its_registers;
+       iodev->nr_regions = ARRAY_SIZE(its_registers);
+       kvm_iodevice_init(&iodev->dev, &kvm_io_gic_ops);
+
+       iodev->base_addr = its->vgic_its_base;
+       iodev->iodev_type = IODEV_ITS;
+       iodev->its = its;
+       ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, iodev->base_addr,
+                                     KVM_VGIC_V3_ITS_SIZE, &iodev->dev);
+out:
+       mutex_unlock(&kvm->slots_lock);
+
+       return ret;
+}
+
+/* Default is 16 cached LPIs per vcpu */
+#define LPI_DEFAULT_PCPU_CACHE_SIZE    16
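+
+/*
+ * The LPI translation cache keeps the most recently used MSI translations
+ * (doorbell address, DeviceID, EventID -> struct vgic_irq), so that
+ * injecting an MSI does not require walking the ITS tables every time.
+ * It is pre-allocated with a fixed number of entries per vcpu.
+ */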
+
+void vgic_lpi_translation_cache_init(struct kvm *kvm)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       unsigned int sz;
+       int i;
+
+       if (!list_empty(&dist->lpi_translation_cache))
+               return;
+
+       sz = atomic_read(&kvm->online_vcpus) * LPI_DEFAULT_PCPU_CACHE_SIZE;
+
+       for (i = 0; i < sz; i++) {
+               struct vgic_translation_cache_entry *cte;
+
+               /* An allocation failure is not fatal */
+               cte = kzalloc(sizeof(*cte), GFP_KERNEL);
+               if (WARN_ON(!cte))
+                       break;
+
+               INIT_LIST_HEAD(&cte->entry);
+               list_add(&cte->entry, &dist->lpi_translation_cache);
+       }
+}
+
+void vgic_lpi_translation_cache_destroy(struct kvm *kvm)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct vgic_translation_cache_entry *cte, *tmp;
+
+       vgic_its_invalidate_cache(kvm);
+
+       list_for_each_entry_safe(cte, tmp,
+                                &dist->lpi_translation_cache, entry) {
+               list_del(&cte->entry);
+               kfree(cte);
+       }
+}
+
+#define INITIAL_BASER_VALUE                                              \
+       (GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb)                | \
+        GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner)         | \
+        GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable)             | \
+        GITS_BASER_PAGE_SIZE_64K)
+
+#define INITIAL_PROPBASER_VALUE                                                  \
+       (GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb)            | \
+        GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, SameAsInner)     | \
+        GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable))
+
+static int vgic_its_create(struct kvm_device *dev, u32 type)
+{
+       struct vgic_its *its;
+
+       if (type != KVM_DEV_TYPE_ARM_VGIC_ITS)
+               return -ENODEV;
+
+       its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL);
+       if (!its)
+               return -ENOMEM;
+
+       if (vgic_initialized(dev->kvm)) {
+               int ret = vgic_v4_init(dev->kvm);
+               if (ret < 0) {
+                       kfree(its);
+                       return ret;
+               }
+
+               vgic_lpi_translation_cache_init(dev->kvm);
+       }
+
+       mutex_init(&its->its_lock);
+       mutex_init(&its->cmd_lock);
+
+       its->vgic_its_base = VGIC_ADDR_UNDEF;
+
+       INIT_LIST_HEAD(&its->device_list);
+       INIT_LIST_HEAD(&its->collection_list);
+
+       dev->kvm->arch.vgic.msis_require_devid = true;
+       dev->kvm->arch.vgic.has_its = true;
+       its->enabled = false;
+       its->dev = dev;
+
+       its->baser_device_table = INITIAL_BASER_VALUE                   |
+               ((u64)GITS_BASER_TYPE_DEVICE << GITS_BASER_TYPE_SHIFT);
+       its->baser_coll_table = INITIAL_BASER_VALUE |
+               ((u64)GITS_BASER_TYPE_COLLECTION << GITS_BASER_TYPE_SHIFT);
+       dev->kvm->arch.vgic.propbaser = INITIAL_PROPBASER_VALUE;
+
+       dev->private = its;
+
+       return vgic_its_set_abi(its, NR_ITS_ABIS - 1);
+}
+
+static void vgic_its_destroy(struct kvm_device *kvm_dev)
+{
+       struct kvm *kvm = kvm_dev->kvm;
+       struct vgic_its *its = kvm_dev->private;
+
+       mutex_lock(&its->its_lock);
+
+       vgic_its_free_device_list(kvm, its);
+       vgic_its_free_collection_list(kvm, its);
+
+       mutex_unlock(&its->its_lock);
+       kfree(its);
+       /* Allocated by kvm_ioctl_create_device(), freed by .destroy */
+       kfree(kvm_dev);
+}
+
+static int vgic_its_has_attr_regs(struct kvm_device *dev,
+                                 struct kvm_device_attr *attr)
+{
+       const struct vgic_register_region *region;
+       gpa_t offset = attr->attr;
+       int align;
+
+       align = (offset < GITS_TYPER) || (offset >= GITS_PIDR4) ? 0x3 : 0x7;
+
+       if (offset & align)
+               return -EINVAL;
+
+       region = vgic_find_mmio_region(its_registers,
+                                      ARRAY_SIZE(its_registers),
+                                      offset);
+       if (!region)
+               return -ENXIO;
+
+       return 0;
+}
+
+static int vgic_its_attr_regs_access(struct kvm_device *dev,
+                                    struct kvm_device_attr *attr,
+                                    u64 *reg, bool is_write)
+{
+       const struct vgic_register_region *region;
+       struct vgic_its *its;
+       gpa_t addr, offset;
+       unsigned int len;
+       int align, ret = 0;
+
+       its = dev->private;
+       offset = attr->attr;
+
+       /*
+        * Although the spec supports upper/lower 32-bit accesses to
+        * 64-bit ITS registers, the userspace ABI requires 64-bit
+        * accesses to all 64-bit wide registers. We therefore only
+        * support 32-bit accesses to GITS_CTLR, GITS_IIDR and the GITS ID
+        * registers.
+        */
+       if ((offset < GITS_TYPER) || (offset >= GITS_PIDR4))
+               align = 0x3;
+       else
+               align = 0x7;
+
+       if (offset & align)
+               return -EINVAL;
+
+       mutex_lock(&dev->kvm->lock);
+
+       if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) {
+               ret = -ENXIO;
+               goto out;
+       }
+
+       region = vgic_find_mmio_region(its_registers,
+                                      ARRAY_SIZE(its_registers),
+                                      offset);
+       if (!region) {
+               ret = -ENXIO;
+               goto out;
+       }
+
+       if (!lock_all_vcpus(dev->kvm)) {
+               ret = -EBUSY;
+               goto out;
+       }
+
+       addr = its->vgic_its_base + offset;
+
+       len = region->access_flags & VGIC_ACCESS_64bit ? 8 : 4;
+
+       if (is_write) {
+               if (region->uaccess_its_write)
+                       ret = region->uaccess_its_write(dev->kvm, its, addr,
+                                                       len, *reg);
+               else
+                       region->its_write(dev->kvm, its, addr, len, *reg);
+       } else {
+               *reg = region->its_read(dev->kvm, its, addr, len);
+       }
+       unlock_all_vcpus(dev->kvm);
+out:
+       mutex_unlock(&dev->kvm->lock);
+       return ret;
+}
+
+static u32 compute_next_devid_offset(struct list_head *h,
+                                    struct its_device *dev)
+{
+       struct its_device *next;
+       u32 next_offset;
+
+       if (list_is_last(&dev->dev_list, h))
+               return 0;
+       next = list_next_entry(dev, dev_list);
+       next_offset = next->device_id - dev->device_id;
+
+       return min_t(u32, next_offset, VITS_DTE_MAX_DEVID_OFFSET);
+}
+
+static u32 compute_next_eventid_offset(struct list_head *h, struct its_ite *ite)
+{
+       struct its_ite *next;
+       u32 next_offset;
+
+       if (list_is_last(&ite->ite_list, h))
+               return 0;
+       next = list_next_entry(ite, ite_list);
+       next_offset = next->event_id - ite->event_id;
+
+       return min_t(u32, next_offset, VITS_ITE_MAX_EVENTID_OFFSET);
+}
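+
+/*
+ * The device/interrupt table entries saved to guest RAM embed the distance
+ * to the next valid DeviceID/EventID (the "next" field), which lets the
+ * restore path jump over unused IDs instead of probing every possible
+ * entry.
+ */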
+
+/**
+ * entry_fn_t - Callback called on a table entry restore path
+ * @its: its handle
+ * @id: id of the entry
+ * @entry: pointer to the entry
+ * @opaque: pointer to opaque caller data
+ *
+ * Return: < 0 on error, 0 if last element was identified, id offset to next
+ * element otherwise
+ */
+typedef int (*entry_fn_t)(struct vgic_its *its, u32 id, void *entry,
+                         void *opaque);
+
+/**
+ * scan_its_table - Scan a contiguous table in guest RAM and apply a function
+ * to each entry
+ *
+ * @its: its handle
+ * @base: base gpa of the table
+ * @size: size of the table in bytes
+ * @esz: entry size in bytes
+ * @start_id: the ID of the first entry in the table
+ * (non-zero for second level tables)
+ * @fn: function to apply on each entry
+ *
+ * Return: < 0 on error, 0 if last element was identified, 1 otherwise
+ * (the last element may not be found on second level tables)
+ */
+static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz,
+                         int start_id, entry_fn_t fn, void *opaque)
+{
+       struct kvm *kvm = its->dev->kvm;
+       unsigned long len = size;
+       int id = start_id;
+       gpa_t gpa = base;
+       char entry[ESZ_MAX];
+       int ret;
+
+       memset(entry, 0, esz);
+
+       while (len > 0) {
+               int next_offset;
+               size_t byte_offset;
+
+               ret = kvm_read_guest_lock(kvm, gpa, entry, esz);
+               if (ret)
+                       return ret;
+
+               next_offset = fn(its, id, entry, opaque);
+               if (next_offset <= 0)
+                       return next_offset;
+
+               byte_offset = next_offset * esz;
+               id += next_offset;
+               gpa += byte_offset;
+               len -= byte_offset;
+       }
+       return 1;
+}
+
+/**
+ * vgic_its_save_ite - Save an interrupt translation entry at @gpa
+ */
+static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev,
+                             struct its_ite *ite, gpa_t gpa, int ite_esz)
+{
+       struct kvm *kvm = its->dev->kvm;
+       u32 next_offset;
+       u64 val;
+
+       next_offset = compute_next_eventid_offset(&dev->itt_head, ite);
+       val = ((u64)next_offset << KVM_ITS_ITE_NEXT_SHIFT) |
+              ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) |
+               ite->collection->collection_id;
+       val = cpu_to_le64(val);
+       return kvm_write_guest_lock(kvm, gpa, &val, ite_esz);
+}
+
+/**
+ * vgic_its_restore_ite - restore an interrupt translation entry
+ * @event_id: id used for indexing
+ * @ptr: pointer to the ITE entry
+ * @opaque: pointer to the its_device
+ */
+static int vgic_its_restore_ite(struct vgic_its *its, u32 event_id,
+                               void *ptr, void *opaque)
+{
+       struct its_device *dev = (struct its_device *)opaque;
+       struct its_collection *collection;
+       struct kvm *kvm = its->dev->kvm;
+       struct kvm_vcpu *vcpu = NULL;
+       u64 val;
+       u64 *p = (u64 *)ptr;
+       struct vgic_irq *irq;
+       u32 coll_id, lpi_id;
+       struct its_ite *ite;
+       u32 offset;
+
+       val = *p;
+
+       val = le64_to_cpu(val);
+
+       coll_id = val & KVM_ITS_ITE_ICID_MASK;
+       lpi_id = (val & KVM_ITS_ITE_PINTID_MASK) >> KVM_ITS_ITE_PINTID_SHIFT;
+
+       if (!lpi_id)
+               return 1; /* invalid entry, no choice but to scan next entry */
+
+       if (lpi_id < VGIC_MIN_LPI)
+               return -EINVAL;
+
+       offset = val >> KVM_ITS_ITE_NEXT_SHIFT;
+       if (event_id + offset >= BIT_ULL(dev->num_eventid_bits))
+               return -EINVAL;
+
+       collection = find_collection(its, coll_id);
+       if (!collection)
+               return -EINVAL;
+
+       ite = vgic_its_alloc_ite(dev, collection, event_id);
+       if (IS_ERR(ite))
+               return PTR_ERR(ite);
+
+       if (its_is_collection_mapped(collection))
+               vcpu = kvm_get_vcpu(kvm, collection->target_addr);
+
+       irq = vgic_add_lpi(kvm, lpi_id, vcpu);
+       if (IS_ERR(irq))
+               return PTR_ERR(irq);
+       ite->irq = irq;
+
+       return offset;
+}
+
+static int vgic_its_ite_cmp(void *priv, struct list_head *a,
+                           struct list_head *b)
+{
+       struct its_ite *itea = container_of(a, struct its_ite, ite_list);
+       struct its_ite *iteb = container_of(b, struct its_ite, ite_list);
+
+       if (itea->event_id < iteb->event_id)
+               return -1;
+       else
+               return 1;
+}
+
+static int vgic_its_save_itt(struct vgic_its *its, struct its_device *device)
+{
+       const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+       gpa_t base = device->itt_addr;
+       struct its_ite *ite;
+       int ret;
+       int ite_esz = abi->ite_esz;
+
+       list_sort(NULL, &device->itt_head, vgic_its_ite_cmp);
+
+       list_for_each_entry(ite, &device->itt_head, ite_list) {
+               gpa_t gpa = base + ite->event_id * ite_esz;
+
+               /*
+                * If an LPI carries the HW bit, this means that this
+                * interrupt is controlled by GICv4, and we do not
+                * have direct access to that state. Let's simply fail
+                * the save operation...
+                */
+               if (ite->irq->hw)
+                       return -EACCES;
+
+               ret = vgic_its_save_ite(its, device, ite, gpa, ite_esz);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
+
+/**
+ * vgic_its_restore_itt - restore the ITT of a device
+ *
+ * @its: its handle
+ * @dev: device handle
+ *
+ * Return 0 on success, < 0 on error
+ */
+static int vgic_its_restore_itt(struct vgic_its *its, struct its_device *dev)
+{
+       const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+       gpa_t base = dev->itt_addr;
+       int ret;
+       int ite_esz = abi->ite_esz;
+       size_t max_size = BIT_ULL(dev->num_eventid_bits) * ite_esz;
+
+       ret = scan_its_table(its, base, max_size, ite_esz, 0,
+                            vgic_its_restore_ite, dev);
+
+       /* scan_its_table returns +1 if all ITEs are invalid */
+       if (ret > 0)
+               ret = 0;
+
+       return ret;
+}
+
+/**
+ * vgic_its_save_dte - Save a device table entry at a given GPA
+ *
+ * @its: ITS handle
+ * @dev: ITS device
+ * @ptr: GPA
+ */
+static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev,
+                            gpa_t ptr, int dte_esz)
+{
+       struct kvm *kvm = its->dev->kvm;
+       u64 val, itt_addr_field;
+       u32 next_offset;
+
+       itt_addr_field = dev->itt_addr >> 8;
+       next_offset = compute_next_devid_offset(&its->device_list, dev);
+       val = (1ULL << KVM_ITS_DTE_VALID_SHIFT |
+              ((u64)next_offset << KVM_ITS_DTE_NEXT_SHIFT) |
+              (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) |
+               (dev->num_eventid_bits - 1));
+       val = cpu_to_le64(val);
+       return kvm_write_guest_lock(kvm, ptr, &val, dte_esz);
+}
+
+/**
+ * vgic_its_restore_dte - restore a device table entry
+ *
+ * @its: its handle
+ * @id: device id the DTE corresponds to
+ * @ptr: kernel VA where the 8 byte DTE is located
+ * @opaque: unused
+ *
+ * Return: < 0 on error, 0 if the dte is the last one, id offset to the
+ * next dte otherwise
+ */
+static int vgic_its_restore_dte(struct vgic_its *its, u32 id,
+                               void *ptr, void *opaque)
+{
+       struct its_device *dev;
+       gpa_t itt_addr;
+       u8 num_eventid_bits;
+       u64 entry = *(u64 *)ptr;
+       bool valid;
+       u32 offset;
+       int ret;
+
+       entry = le64_to_cpu(entry);
+
+       valid = entry >> KVM_ITS_DTE_VALID_SHIFT;
+       num_eventid_bits = (entry & KVM_ITS_DTE_SIZE_MASK) + 1;
+       itt_addr = ((entry & KVM_ITS_DTE_ITTADDR_MASK)
+                       >> KVM_ITS_DTE_ITTADDR_SHIFT) << 8;
+
+       if (!valid)
+               return 1;
+
+       /* dte entry is valid */
+       offset = (entry & KVM_ITS_DTE_NEXT_MASK) >> KVM_ITS_DTE_NEXT_SHIFT;
+
+       dev = vgic_its_alloc_device(its, id, itt_addr, num_eventid_bits);
+       if (IS_ERR(dev))
+               return PTR_ERR(dev);
+
+       ret = vgic_its_restore_itt(its, dev);
+       if (ret) {
+               vgic_its_free_device(its->dev->kvm, dev);
+               return ret;
+       }
+
+       return offset;
+}
+
+static int vgic_its_device_cmp(void *priv, struct list_head *a,
+                              struct list_head *b)
+{
+       struct its_device *deva = container_of(a, struct its_device, dev_list);
+       struct its_device *devb = container_of(b, struct its_device, dev_list);
+
+       if (deva->device_id < devb->device_id)
+               return -1;
+       else
+               return 1;
+}
+
+/**
+ * vgic_its_save_device_tables - Save the device table and all ITTs
+ * into guest RAM
+ *
+ * L1/L2 handling is hidden by the vgic_its_check_id() helper, which directly
+ * returns the GPA of the device table entry
+ */
+static int vgic_its_save_device_tables(struct vgic_its *its)
+{
+       const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+       u64 baser = its->baser_device_table;
+       struct its_device *dev;
+       int dte_esz = abi->dte_esz;
+
+       if (!(baser & GITS_BASER_VALID))
+               return 0;
+
+       list_sort(NULL, &its->device_list, vgic_its_device_cmp);
+
+       list_for_each_entry(dev, &its->device_list, dev_list) {
+               int ret;
+               gpa_t eaddr;
+
+               if (!vgic_its_check_id(its, baser,
+                                      dev->device_id, &eaddr))
+                       return -EINVAL;
+
+               ret = vgic_its_save_itt(its, dev);
+               if (ret)
+                       return ret;
+
+               ret = vgic_its_save_dte(its, dev, eaddr, dte_esz);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
+
+/**
+ * handle_l1_dte - callback used for L1 device table entries (two-level case)
+ *
+ * @its: its handle
+ * @id: index of the entry in the L1 table
+ * @addr: kernel VA
+ * @opaque: unused
+ *
+ * L1 table entries are scanned one entry at a time.
+ * Return < 0 if error, 0 if last dte was found when scanning the L2
+ * table, +1 otherwise (meaning next L1 entry must be scanned)
+ */
+static int handle_l1_dte(struct vgic_its *its, u32 id, void *addr,
+                        void *opaque)
+{
+       const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+       int l2_start_id = id * (SZ_64K / abi->dte_esz);
+       u64 entry = *(u64 *)addr;
+       int dte_esz = abi->dte_esz;
+       gpa_t gpa;
+       int ret;
+
+       entry = le64_to_cpu(entry);
+
+       if (!(entry & KVM_ITS_L1E_VALID_MASK))
+               return 1;
+
+       gpa = entry & KVM_ITS_L1E_ADDR_MASK;
+
+       ret = scan_its_table(its, gpa, SZ_64K, dte_esz,
+                            l2_start_id, vgic_its_restore_dte, NULL);
+
+       return ret;
+}
+
+/**
+ * vgic_its_restore_device_tables - Restore the device table and all ITTs
+ * from guest RAM to internal data structs
+ */
+static int vgic_its_restore_device_tables(struct vgic_its *its)
+{
+       const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+       u64 baser = its->baser_device_table;
+       int l1_esz, ret;
+       int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
+       gpa_t l1_gpa;
+
+       if (!(baser & GITS_BASER_VALID))
+               return 0;
+
+       l1_gpa = GITS_BASER_ADDR_48_to_52(baser);
+
+       if (baser & GITS_BASER_INDIRECT) {
+               l1_esz = GITS_LVL1_ENTRY_SIZE;
+               ret = scan_its_table(its, l1_gpa, l1_tbl_size, l1_esz, 0,
+                                    handle_l1_dte, NULL);
+       } else {
+               l1_esz = abi->dte_esz;
+               ret = scan_its_table(its, l1_gpa, l1_tbl_size, l1_esz, 0,
+                                    vgic_its_restore_dte, NULL);
+       }
+
+       /* scan_its_table returns +1 if all entries are invalid */
+       if (ret > 0)
+               ret = 0;
+
+       return ret;
+}
+
+static int vgic_its_save_cte(struct vgic_its *its,
+                            struct its_collection *collection,
+                            gpa_t gpa, int esz)
+{
+       u64 val;
+
+       val = (1ULL << KVM_ITS_CTE_VALID_SHIFT |
+              ((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) |
+              collection->collection_id);
+       val = cpu_to_le64(val);
+       return kvm_write_guest_lock(its->dev->kvm, gpa, &val, esz);
+}
+
+static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz)
+{
+       struct its_collection *collection;
+       struct kvm *kvm = its->dev->kvm;
+       u32 target_addr, coll_id;
+       u64 val;
+       int ret;
+
+       BUG_ON(esz > sizeof(val));
+       ret = kvm_read_guest_lock(kvm, gpa, &val, esz);
+       if (ret)
+               return ret;
+       val = le64_to_cpu(val);
+       if (!(val & KVM_ITS_CTE_VALID_MASK))
+               return 0;
+
+       target_addr = (u32)(val >> KVM_ITS_CTE_RDBASE_SHIFT);
+       coll_id = val & KVM_ITS_CTE_ICID_MASK;
+
+       if (target_addr != COLLECTION_NOT_MAPPED &&
+           target_addr >= atomic_read(&kvm->online_vcpus))
+               return -EINVAL;
+
+       collection = find_collection(its, coll_id);
+       if (collection)
+               return -EEXIST;
+       ret = vgic_its_alloc_collection(its, &collection, coll_id);
+       if (ret)
+               return ret;
+       collection->target_addr = target_addr;
+       return 1;
+}
+
+/**
+ * vgic_its_save_collection_table - Save the collection table into
+ * guest RAM
+ */
+static int vgic_its_save_collection_table(struct vgic_its *its)
+{
+       const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+       u64 baser = its->baser_coll_table;
+       gpa_t gpa = GITS_BASER_ADDR_48_to_52(baser);
+       struct its_collection *collection;
+       u64 val;
+       size_t max_size, filled = 0;
+       int ret, cte_esz = abi->cte_esz;
+
+       if (!(baser & GITS_BASER_VALID))
+               return 0;
+
+       max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
+
+       list_for_each_entry(collection, &its->collection_list, coll_list) {
+               ret = vgic_its_save_cte(its, collection, gpa, cte_esz);
+               if (ret)
+                       return ret;
+               gpa += cte_esz;
+               filled += cte_esz;
+       }
+
+       if (filled == max_size)
+               return 0;
+
+       /*
+        * The table is not fully filled; add a final dummy element
+        * with the valid bit cleared.
+        */
+       val = 0;
+       BUG_ON(cte_esz > sizeof(val));
+       ret = kvm_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz);
+       return ret;
+}
+
+/**
+ * vgic_its_restore_collection_table - Read the collection table from
+ * guest memory and restore the ITS internal state. Requires the
+ * BASER registers to have been restored beforehand.
+ */
+static int vgic_its_restore_collection_table(struct vgic_its *its)
+{
+       const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+       u64 baser = its->baser_coll_table;
+       int cte_esz = abi->cte_esz;
+       size_t max_size, read = 0;
+       gpa_t gpa;
+       int ret;
+
+       if (!(baser & GITS_BASER_VALID))
+               return 0;
+
+       gpa = GITS_BASER_ADDR_48_to_52(baser);
+
+       max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
+
+       while (read < max_size) {
+               ret = vgic_its_restore_cte(its, gpa, cte_esz);
+               if (ret <= 0)
+                       break;
+               gpa += cte_esz;
+               read += cte_esz;
+       }
+
+       if (ret > 0)
+               return 0;
+
+       return ret;
+}
+
+/**
+ * vgic_its_save_tables_v0 - Save the ITS tables into guest RAM
+ * according to the v0 ABI
+ */
+static int vgic_its_save_tables_v0(struct vgic_its *its)
+{
+       int ret;
+
+       ret = vgic_its_save_device_tables(its);
+       if (ret)
+               return ret;
+
+       return vgic_its_save_collection_table(its);
+}
+
+/**
+ * vgic_its_restore_tables_v0 - Restore the ITS tables from guest RAM
+ * to internal data structs according to the v0 ABI
+ */
+static int vgic_its_restore_tables_v0(struct vgic_its *its)
+{
+       int ret;
+
+       ret = vgic_its_restore_collection_table(its);
+       if (ret)
+               return ret;
+
+       return vgic_its_restore_device_tables(its);
+}
+
+static int vgic_its_commit_v0(struct vgic_its *its)
+{
+       const struct vgic_its_abi *abi;
+
+       abi = vgic_its_get_abi(its);
+       its->baser_coll_table &= ~GITS_BASER_ENTRY_SIZE_MASK;
+       its->baser_device_table &= ~GITS_BASER_ENTRY_SIZE_MASK;
+
+       its->baser_coll_table |= (GIC_ENCODE_SZ(abi->cte_esz, 5)
+                                       << GITS_BASER_ENTRY_SIZE_SHIFT);
+
+       its->baser_device_table |= (GIC_ENCODE_SZ(abi->dte_esz, 5)
+                                       << GITS_BASER_ENTRY_SIZE_SHIFT);
+       return 0;
+}
+
+static void vgic_its_reset(struct kvm *kvm, struct vgic_its *its)
+{
+       /* We need to keep the ABI specific field values */
+       its->baser_coll_table &= ~GITS_BASER_VALID;
+       its->baser_device_table &= ~GITS_BASER_VALID;
+       its->cbaser = 0;
+       its->creadr = 0;
+       its->cwriter = 0;
+       its->enabled = 0;
+       vgic_its_free_device_list(kvm, its);
+       vgic_its_free_collection_list(kvm, its);
+}
+
+static int vgic_its_has_attr(struct kvm_device *dev,
+                            struct kvm_device_attr *attr)
+{
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_ADDR:
+               switch (attr->attr) {
+               case KVM_VGIC_ITS_ADDR_TYPE:
+                       return 0;
+               }
+               break;
+       case KVM_DEV_ARM_VGIC_GRP_CTRL:
+               switch (attr->attr) {
+               case KVM_DEV_ARM_VGIC_CTRL_INIT:
+                       return 0;
+               case KVM_DEV_ARM_ITS_CTRL_RESET:
+                       return 0;
+               case KVM_DEV_ARM_ITS_SAVE_TABLES:
+                       return 0;
+               case KVM_DEV_ARM_ITS_RESTORE_TABLES:
+                       return 0;
+               }
+               break;
+       case KVM_DEV_ARM_VGIC_GRP_ITS_REGS:
+               return vgic_its_has_attr_regs(dev, attr);
+       }
+       return -ENXIO;
+}
+
+static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
+{
+       const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+       int ret = 0;
+
+       if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */
+               return 0;
+
+       mutex_lock(&kvm->lock);
+       mutex_lock(&its->its_lock);
+
+       if (!lock_all_vcpus(kvm)) {
+               mutex_unlock(&its->its_lock);
+               mutex_unlock(&kvm->lock);
+               return -EBUSY;
+       }
+
+       switch (attr) {
+       case KVM_DEV_ARM_ITS_CTRL_RESET:
+               vgic_its_reset(kvm, its);
+               break;
+       case KVM_DEV_ARM_ITS_SAVE_TABLES:
+               ret = abi->save_tables(its);
+               break;
+       case KVM_DEV_ARM_ITS_RESTORE_TABLES:
+               ret = abi->restore_tables(its);
+               break;
+       }
+
+       unlock_all_vcpus(kvm);
+       mutex_unlock(&its->its_lock);
+       mutex_unlock(&kvm->lock);
+       return ret;
+}
+
+static int vgic_its_set_attr(struct kvm_device *dev,
+                            struct kvm_device_attr *attr)
+{
+       struct vgic_its *its = dev->private;
+       int ret;
+
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+               unsigned long type = (unsigned long)attr->attr;
+               u64 addr;
+
+               if (type != KVM_VGIC_ITS_ADDR_TYPE)
+                       return -ENODEV;
+
+               if (copy_from_user(&addr, uaddr, sizeof(addr)))
+                       return -EFAULT;
+
+               ret = vgic_check_ioaddr(dev->kvm, &its->vgic_its_base,
+                                       addr, SZ_64K);
+               if (ret)
+                       return ret;
+
+               return vgic_register_its_iodev(dev->kvm, its, addr);
+       }
+       case KVM_DEV_ARM_VGIC_GRP_CTRL:
+               return vgic_its_ctrl(dev->kvm, its, attr->attr);
+       case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: {
+               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+               u64 reg;
+
+               if (get_user(reg, uaddr))
+                       return -EFAULT;
+
+               return vgic_its_attr_regs_access(dev, attr, &reg, true);
+       }
+       }
+       return -ENXIO;
+}
+
+static int vgic_its_get_attr(struct kvm_device *dev,
+                            struct kvm_device_attr *attr)
+{
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+               struct vgic_its *its = dev->private;
+               u64 addr = its->vgic_its_base;
+               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+               unsigned long type = (unsigned long)attr->attr;
+
+               if (type != KVM_VGIC_ITS_ADDR_TYPE)
+                       return -ENODEV;
+
+               if (copy_to_user(uaddr, &addr, sizeof(addr)))
+                       return -EFAULT;
+               break;
+       }
+       case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: {
+               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+               u64 reg;
+               int ret;
+
+               ret = vgic_its_attr_regs_access(dev, attr, &reg, false);
+               if (ret)
+                       return ret;
+               return put_user(reg, uaddr);
+       }
+       default:
+               return -ENXIO;
+       }
+
+       return 0;
+}
+
+static struct kvm_device_ops kvm_arm_vgic_its_ops = {
+       .name = "kvm-arm-vgic-its",
+       .create = vgic_its_create,
+       .destroy = vgic_its_destroy,
+       .set_attr = vgic_its_set_attr,
+       .get_attr = vgic_its_get_attr,
+       .has_attr = vgic_its_has_attr,
+};
+
+int kvm_vgic_register_its_device(void)
+{
+       return kvm_register_device_ops(&kvm_arm_vgic_its_ops,
+                                      KVM_DEV_TYPE_ARM_VGIC_ITS);
+}
diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c
new file mode 100644 (file)
index 0000000..4441967
--- /dev/null
@@ -0,0 +1,741 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VGIC: KVM DEVICE API
+ *
+ * Copyright (C) 2015 ARM Ltd.
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+#include <linux/kvm_host.h>
+#include <kvm/arm_vgic.h>
+#include <linux/uaccess.h>
+#include <asm/kvm_mmu.h>
+#include <asm/cputype.h>
+#include "vgic.h"
+
+/* common helpers */
+
+int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
+                     phys_addr_t addr, phys_addr_t alignment)
+{
+       if (addr & ~kvm_phys_mask(kvm))
+               return -E2BIG;
+
+       if (!IS_ALIGNED(addr, alignment))
+               return -EINVAL;
+
+       if (!IS_VGIC_ADDR_UNDEF(*ioaddr))
+               return -EEXIST;
+
+       return 0;
+}
+
+static int vgic_check_type(struct kvm *kvm, int type_needed)
+{
+       if (kvm->arch.vgic.vgic_model != type_needed)
+               return -ENODEV;
+       else
+               return 0;
+}
+
+/**
+ * kvm_vgic_addr - set or get vgic VM base addresses
+ * @kvm:   pointer to the vm struct
+ * @type:  the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX
+ * @addr:  pointer to address value
+ * @write: if true set the address in the VM address space, if false read the
+ *          address
+ *
+ * Set or get the vgic base addresses for the distributor and the virtual CPU
+ * interface in the VM physical address space.  These addresses are properties
+ * of the emulated core/SoC, so it is user space that initially provides
+ * this information.
+ * Check them for sanity (alignment, double assignment). We can't check for
+ * overlapping regions in case of a virtual GICv3 here, since we don't know
+ * the number of VCPUs yet, so we defer this check to map_resources().
+ */
+int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
+{
+       int r = 0;
+       struct vgic_dist *vgic = &kvm->arch.vgic;
+       phys_addr_t *addr_ptr, alignment;
+       u64 undef_value = VGIC_ADDR_UNDEF;
+
+       mutex_lock(&kvm->lock);
+       switch (type) {
+       case KVM_VGIC_V2_ADDR_TYPE_DIST:
+               r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
+               addr_ptr = &vgic->vgic_dist_base;
+               alignment = SZ_4K;
+               break;
+       case KVM_VGIC_V2_ADDR_TYPE_CPU:
+               r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
+               addr_ptr = &vgic->vgic_cpu_base;
+               alignment = SZ_4K;
+               break;
+       case KVM_VGIC_V3_ADDR_TYPE_DIST:
+               r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3);
+               addr_ptr = &vgic->vgic_dist_base;
+               alignment = SZ_64K;
+               break;
+       case KVM_VGIC_V3_ADDR_TYPE_REDIST: {
+               struct vgic_redist_region *rdreg;
+
+               r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3);
+               if (r)
+                       break;
+               if (write) {
+                       r = vgic_v3_set_redist_base(kvm, 0, *addr, 0);
+                       goto out;
+               }
+               rdreg = list_first_entry(&vgic->rd_regions,
+                                        struct vgic_redist_region, list);
+               if (!rdreg)
+                       addr_ptr = &undef_value;
+               else
+                       addr_ptr = &rdreg->base;
+               break;
+       }
+       case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION:
+       {
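+               /*
+                * As documented in the KVM device API, the 64-bit attribute
+                * value packs the whole region description: bits [63:52]
+                * hold the redistributor count, [51:16] the 64kB-aligned
+                * base address, [15:12] flags (must be zero) and [11:0]
+                * the region index.
+                */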
+               struct vgic_redist_region *rdreg;
+               u8 index;
+
+               r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3);
+               if (r)
+                       break;
+
+               index = *addr & KVM_VGIC_V3_RDIST_INDEX_MASK;
+
+               if (write) {
+                       gpa_t base = *addr & KVM_VGIC_V3_RDIST_BASE_MASK;
+                       u32 count = (*addr & KVM_VGIC_V3_RDIST_COUNT_MASK)
+                                       >> KVM_VGIC_V3_RDIST_COUNT_SHIFT;
+                       u8 flags = (*addr & KVM_VGIC_V3_RDIST_FLAGS_MASK)
+                                       >> KVM_VGIC_V3_RDIST_FLAGS_SHIFT;
+
+                       if (!count || flags)
+                               r = -EINVAL;
+                       else
+                               r = vgic_v3_set_redist_base(kvm, index,
+                                                           base, count);
+                       goto out;
+               }
+
+               rdreg = vgic_v3_rdist_region_from_index(kvm, index);
+               if (!rdreg) {
+                       r = -ENOENT;
+                       goto out;
+               }
+
+               *addr = index;
+               *addr |= rdreg->base;
+               *addr |= (u64)rdreg->count << KVM_VGIC_V3_RDIST_COUNT_SHIFT;
+               goto out;
+       }
+       default:
+               r = -ENODEV;
+       }
+
+       if (r)
+               goto out;
+
+       if (write) {
+               r = vgic_check_ioaddr(kvm, addr_ptr, *addr, alignment);
+               if (!r)
+                       *addr_ptr = *addr;
+       } else {
+               *addr = *addr_ptr;
+       }
+
+out:
+       mutex_unlock(&kvm->lock);
+       return r;
+}
+
+static int vgic_set_common_attr(struct kvm_device *dev,
+                               struct kvm_device_attr *attr)
+{
+       int r;
+
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+               u64 addr;
+               unsigned long type = (unsigned long)attr->attr;
+
+               if (copy_from_user(&addr, uaddr, sizeof(addr)))
+                       return -EFAULT;
+
+               r = kvm_vgic_addr(dev->kvm, type, &addr, true);
+               return (r == -ENODEV) ? -ENXIO : r;
+       }
+       case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
+               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+               u32 val;
+               int ret = 0;
+
+               if (get_user(val, uaddr))
+                       return -EFAULT;
+
+               /*
+                * We require:
+                * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs
+                * - at most 1024 interrupts
+                * - a multiple of 32 interrupts
+                */
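+               /* e.g. val == 96 gives the 32 private IRQs plus 64 SPIs */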
+               if (val < (VGIC_NR_PRIVATE_IRQS + 32) ||
+                   val > VGIC_MAX_RESERVED ||
+                   (val & 31))
+                       return -EINVAL;
+
+               mutex_lock(&dev->kvm->lock);
+
+               if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_spis)
+                       ret = -EBUSY;
+               else
+                       dev->kvm->arch.vgic.nr_spis =
+                               val - VGIC_NR_PRIVATE_IRQS;
+
+               mutex_unlock(&dev->kvm->lock);
+
+               return ret;
+       }
+       case KVM_DEV_ARM_VGIC_GRP_CTRL: {
+               switch (attr->attr) {
+               case KVM_DEV_ARM_VGIC_CTRL_INIT:
+                       mutex_lock(&dev->kvm->lock);
+                       r = vgic_init(dev->kvm);
+                       mutex_unlock(&dev->kvm->lock);
+                       return r;
+               }
+               break;
+       }
+       }
+
+       return -ENXIO;
+}
+
+static int vgic_get_common_attr(struct kvm_device *dev,
+                               struct kvm_device_attr *attr)
+{
+       int r = -ENXIO;
+
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+               u64 addr;
+               unsigned long type = (unsigned long)attr->attr;
+
+               r = kvm_vgic_addr(dev->kvm, type, &addr, false);
+               if (r)
+                       return (r == -ENODEV) ? -ENXIO : r;
+
+               if (copy_to_user(uaddr, &addr, sizeof(addr)))
+                       return -EFAULT;
+               break;
+       }
+       case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
+               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+
+               r = put_user(dev->kvm->arch.vgic.nr_spis +
+                            VGIC_NR_PRIVATE_IRQS, uaddr);
+               break;
+       }
+       }
+
+       return r;
+}
+
+static int vgic_create(struct kvm_device *dev, u32 type)
+{
+       return kvm_vgic_create(dev->kvm, type);
+}
+
+static void vgic_destroy(struct kvm_device *dev)
+{
+       kfree(dev);
+}
+
+int kvm_register_vgic_device(unsigned long type)
+{
+       int ret = -ENODEV;
+
+       switch (type) {
+       case KVM_DEV_TYPE_ARM_VGIC_V2:
+               ret = kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
+                                             KVM_DEV_TYPE_ARM_VGIC_V2);
+               break;
+       case KVM_DEV_TYPE_ARM_VGIC_V3:
+               ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops,
+                                             KVM_DEV_TYPE_ARM_VGIC_V3);
+
+               if (ret)
+                       break;
+               ret = kvm_vgic_register_its_device();
+               break;
+       }
+
+       return ret;
+}
+
+int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
+                      struct vgic_reg_attr *reg_attr)
+{
+       int cpuid;
+
+       cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >>
+                KVM_DEV_ARM_VGIC_CPUID_SHIFT;
+
+       if (cpuid >= atomic_read(&dev->kvm->online_vcpus))
+               return -EINVAL;
+
+       reg_attr->vcpu = kvm_get_vcpu(dev->kvm, cpuid);
+       reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
+
+       return 0;
+}
+
+/* Unlock vcpus from @vcpu_lock_idx down to vcpu 0 */
+static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx)
+{
+       struct kvm_vcpu *tmp_vcpu;
+
+       for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
+               tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
+               mutex_unlock(&tmp_vcpu->mutex);
+       }
+}
+
+void unlock_all_vcpus(struct kvm *kvm)
+{
+       unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1);
+}
+
+/* Returns true if all vcpus were locked, false otherwise */
+bool lock_all_vcpus(struct kvm *kvm)
+{
+       struct kvm_vcpu *tmp_vcpu;
+       int c;
+
+       /*
+        * Any time a vcpu is run, vcpu_load is called which tries to grab the
+        * vcpu->mutex.  By grabbing the vcpu->mutex of all VCPUs we ensure
+        * that no other VCPU can run and fiddle with the vgic state while we
+        * access it.
+        */
+       kvm_for_each_vcpu(c, tmp_vcpu, kvm) {
+               if (!mutex_trylock(&tmp_vcpu->mutex)) {
+                       unlock_vcpus(kvm, c - 1);
+                       return false;
+               }
+       }
+
+       return true;
+}
+
+/**
+ * vgic_v2_attr_regs_access - allows user space to access VGIC v2 state
+ *
+ * @dev:      kvm device handle
+ * @attr:     kvm device attribute
+ * @reg:      pointer to the value to be read or written
+ * @is_write: true if userspace is writing a register
+ */
+static int vgic_v2_attr_regs_access(struct kvm_device *dev,
+                                   struct kvm_device_attr *attr,
+                                   u32 *reg, bool is_write)
+{
+       struct vgic_reg_attr reg_attr;
+       gpa_t addr;
+       struct kvm_vcpu *vcpu;
+       int ret;
+
+       ret = vgic_v2_parse_attr(dev, attr, &reg_attr);
+       if (ret)
+               return ret;
+
+       vcpu = reg_attr.vcpu;
+       addr = reg_attr.addr;
+
+       mutex_lock(&dev->kvm->lock);
+
+       ret = vgic_init(dev->kvm);
+       if (ret)
+               goto out;
+
+       if (!lock_all_vcpus(dev->kvm)) {
+               ret = -EBUSY;
+               goto out;
+       }
+
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
+               ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, reg);
+               break;
+       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+               ret = vgic_v2_dist_uaccess(vcpu, is_write, addr, reg);
+               break;
+       default:
+               ret = -EINVAL;
+               break;
+       }
+
+       unlock_all_vcpus(dev->kvm);
+out:
+       mutex_unlock(&dev->kvm->lock);
+       return ret;
+}
+
+static int vgic_v2_set_attr(struct kvm_device *dev,
+                           struct kvm_device_attr *attr)
+{
+       int ret;
+
+       ret = vgic_set_common_attr(dev, attr);
+       if (ret != -ENXIO)
+               return ret;
+
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
+               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+               u32 reg;
+
+               if (get_user(reg, uaddr))
+                       return -EFAULT;
+
+               return vgic_v2_attr_regs_access(dev, attr, &reg, true);
+       }
+       }
+
+       return -ENXIO;
+}
+
+static int vgic_v2_get_attr(struct kvm_device *dev,
+                           struct kvm_device_attr *attr)
+{
+       int ret;
+
+       ret = vgic_get_common_attr(dev, attr);
+       if (ret != -ENXIO)
+               return ret;
+
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
+               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+               u32 reg = 0;
+
+               ret = vgic_v2_attr_regs_access(dev, attr, &reg, false);
+               if (ret)
+                       return ret;
+               return put_user(reg, uaddr);
+       }
+       }
+
+       return -ENXIO;
+}
+
+static int vgic_v2_has_attr(struct kvm_device *dev,
+                           struct kvm_device_attr *attr)
+{
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_ADDR:
+               switch (attr->attr) {
+               case KVM_VGIC_V2_ADDR_TYPE_DIST:
+               case KVM_VGIC_V2_ADDR_TYPE_CPU:
+                       return 0;
+               }
+               break;
+       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
+               return vgic_v2_has_attr_regs(dev, attr);
+       case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
+               return 0;
+       case KVM_DEV_ARM_VGIC_GRP_CTRL:
+               switch (attr->attr) {
+               case KVM_DEV_ARM_VGIC_CTRL_INIT:
+                       return 0;
+               }
+       }
+       return -ENXIO;
+}
+
+struct kvm_device_ops kvm_arm_vgic_v2_ops = {
+       .name = "kvm-arm-vgic-v2",
+       .create = vgic_create,
+       .destroy = vgic_destroy,
+       .set_attr = vgic_v2_set_attr,
+       .get_attr = vgic_v2_get_attr,
+       .has_attr = vgic_v2_has_attr,
+};
+
+int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
+                      struct vgic_reg_attr *reg_attr)
+{
+       unsigned long vgic_mpidr, mpidr_reg;
+
+       /*
+        * For KVM_DEV_ARM_VGIC_GRP_DIST_REGS group,
+        * attr might not hold MPIDR. Hence assume vcpu0.
+        */
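+       /*
+        * For the other groups, bits [63:32] of attr->attr carry the target
+        * vcpu's MPIDR affinity fields; VGIC_TO_MPIDR() expands them back
+        * into the MPIDR_EL1 register layout for the vcpu lookup.
+        */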
+       if (attr->group != KVM_DEV_ARM_VGIC_GRP_DIST_REGS) {
+               vgic_mpidr = (attr->attr & KVM_DEV_ARM_VGIC_V3_MPIDR_MASK) >>
+                             KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT;
+
+               mpidr_reg = VGIC_TO_MPIDR(vgic_mpidr);
+               reg_attr->vcpu = kvm_mpidr_to_vcpu(dev->kvm, mpidr_reg);
+       } else {
+               reg_attr->vcpu = kvm_get_vcpu(dev->kvm, 0);
+       }
+
+       if (!reg_attr->vcpu)
+               return -EINVAL;
+
+       reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
+
+       return 0;
+}
+
+/**
+ * vgic_v3_attr_regs_access - allows user space to access VGIC v3 state
+ *
+ * @dev:      kvm device handle
+ * @attr:     kvm device attribute
+ * @reg:      pointer to the value to be read or written
+ * @is_write: true if userspace is writing a register
+ */
+static int vgic_v3_attr_regs_access(struct kvm_device *dev,
+                                   struct kvm_device_attr *attr,
+                                   u64 *reg, bool is_write)
+{
+       struct vgic_reg_attr reg_attr;
+       gpa_t addr;
+       struct kvm_vcpu *vcpu;
+       int ret;
+       u32 tmp32;
+
+       ret = vgic_v3_parse_attr(dev, attr, &reg_attr);
+       if (ret)
+               return ret;
+
+       vcpu = reg_attr.vcpu;
+       addr = reg_attr.addr;
+
+       mutex_lock(&dev->kvm->lock);
+
+       if (unlikely(!vgic_initialized(dev->kvm))) {
+               ret = -EBUSY;
+               goto out;
+       }
+
+       if (!lock_all_vcpus(dev->kvm)) {
+               ret = -EBUSY;
+               goto out;
+       }
+
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+               if (is_write)
+                       tmp32 = *reg;
+
+               ret = vgic_v3_dist_uaccess(vcpu, is_write, addr, &tmp32);
+               if (!is_write)
+                       *reg = tmp32;
+               break;
+       case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
+               if (is_write)
+                       tmp32 = *reg;
+
+               ret = vgic_v3_redist_uaccess(vcpu, is_write, addr, &tmp32);
+               if (!is_write)
+                       *reg = tmp32;
+               break;
+       case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
+               u64 regid;
+
+               regid = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK);
+               ret = vgic_v3_cpu_sysregs_uaccess(vcpu, is_write,
+                                                 regid, reg);
+               break;
+       }
+       case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
+               unsigned int info, intid;
+
+               info = (attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >>
+                       KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT;
+               if (info == VGIC_LEVEL_INFO_LINE_LEVEL) {
+                       intid = attr->attr &
+                               KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK;
+                       ret = vgic_v3_line_level_info_uaccess(vcpu, is_write,
+                                                             intid, reg);
+               } else {
+                       ret = -EINVAL;
+               }
+               break;
+       }
+       default:
+               ret = -EINVAL;
+               break;
+       }
+
+       unlock_all_vcpus(dev->kvm);
+out:
+       mutex_unlock(&dev->kvm->lock);
+       return ret;
+}
+
+static int vgic_v3_set_attr(struct kvm_device *dev,
+                           struct kvm_device_attr *attr)
+{
+       int ret;
+
+       ret = vgic_set_common_attr(dev, attr);
+       if (ret != -ENXIO)
+               return ret;
+
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+       case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: {
+               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+               u32 tmp32;
+               u64 reg;
+
+               if (get_user(tmp32, uaddr))
+                       return -EFAULT;
+
+               reg = tmp32;
+               return vgic_v3_attr_regs_access(dev, attr, &reg, true);
+       }
+       case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
+               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+               u64 reg;
+
+               if (get_user(reg, uaddr))
+                       return -EFAULT;
+
+               return vgic_v3_attr_regs_access(dev, attr, &reg, true);
+       }
+       case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
+               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+               u64 reg;
+               u32 tmp32;
+
+               if (get_user(tmp32, uaddr))
+                       return -EFAULT;
+
+               reg = tmp32;
+               return vgic_v3_attr_regs_access(dev, attr, &reg, true);
+       }
+       case KVM_DEV_ARM_VGIC_GRP_CTRL: {
+               int ret;
+
+               switch (attr->attr) {
+               case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES:
+                       mutex_lock(&dev->kvm->lock);
+
+                       if (!lock_all_vcpus(dev->kvm)) {
+                               mutex_unlock(&dev->kvm->lock);
+                               return -EBUSY;
+                       }
+                       ret = vgic_v3_save_pending_tables(dev->kvm);
+                       unlock_all_vcpus(dev->kvm);
+                       mutex_unlock(&dev->kvm->lock);
+                       return ret;
+               }
+               break;
+       }
+       }
+       return -ENXIO;
+}
+
+static int vgic_v3_get_attr(struct kvm_device *dev,
+                           struct kvm_device_attr *attr)
+{
+       int ret;
+
+       ret = vgic_get_common_attr(dev, attr);
+       if (ret != -ENXIO)
+               return ret;
+
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+       case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: {
+               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+               u64 reg;
+               u32 tmp32;
+
+               ret = vgic_v3_attr_regs_access(dev, attr, &reg, false);
+               if (ret)
+                       return ret;
+               tmp32 = reg;
+               return put_user(tmp32, uaddr);
+       }
+       case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
+               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+               u64 reg;
+
+               ret = vgic_v3_attr_regs_access(dev, attr, &reg, false);
+               if (ret)
+                       return ret;
+               return put_user(reg, uaddr);
+       }
+       case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
+               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+               u64 reg;
+               u32 tmp32;
+
+               ret = vgic_v3_attr_regs_access(dev, attr, &reg, false);
+               if (ret)
+                       return ret;
+               tmp32 = reg;
+               return put_user(tmp32, uaddr);
+       }
+       }
+       return -ENXIO;
+}
+
+static int vgic_v3_has_attr(struct kvm_device *dev,
+                           struct kvm_device_attr *attr)
+{
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_ADDR:
+               switch (attr->attr) {
+               case KVM_VGIC_V3_ADDR_TYPE_DIST:
+               case KVM_VGIC_V3_ADDR_TYPE_REDIST:
+               case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION:
+                       return 0;
+               }
+               break;
+       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+       case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
+       case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
+               return vgic_v3_has_attr_regs(dev, attr);
+       case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
+               return 0;
+       case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
+               if (((attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >>
+                     KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) ==
+                     VGIC_LEVEL_INFO_LINE_LEVEL)
+                       return 0;
+               break;
+       }
+       case KVM_DEV_ARM_VGIC_GRP_CTRL:
+               switch (attr->attr) {
+               case KVM_DEV_ARM_VGIC_CTRL_INIT:
+                       return 0;
+               case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES:
+                       return 0;
+               }
+       }
+       return -ENXIO;
+}
+
+struct kvm_device_ops kvm_arm_vgic_v3_ops = {
+       .name = "kvm-arm-vgic-v3",
+       .create = vgic_create,
+       .destroy = vgic_destroy,
+       .set_attr = vgic_v3_set_attr,
+       .get_attr = vgic_v3_get_attr,
+       .has_attr = vgic_v3_has_attr,
+};
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
new file mode 100644 (file)
index 0000000..a016f07
--- /dev/null
@@ -0,0 +1,550 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VGICv2 MMIO handling functions
+ */
+
+#include <linux/irqchip/arm-gic.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/nospec.h>
+
+#include <kvm/iodev.h>
+#include <kvm/arm_vgic.h>
+
+#include "vgic.h"
+#include "vgic-mmio.h"
+
+/*
+ * The Revision field in the IIDR has the following meanings:
+ *
+ * Revision 1: Report GICv2 interrupts as group 0 instead of group 1
+ * Revision 2: Interrupt groups are guest-configurable and signaled using
+ *            their configured groups.
+ */
+
+static unsigned long vgic_mmio_read_v2_misc(struct kvm_vcpu *vcpu,
+                                           gpa_t addr, unsigned int len)
+{
+       struct vgic_dist *vgic = &vcpu->kvm->arch.vgic;
+       u32 value;
+
+       switch (addr & 0x0c) {
+       case GIC_DIST_CTRL:
+               value = vgic->enabled ? GICD_ENABLE : 0;
+               break;
+       case GIC_DIST_CTR:
+               value = vgic->nr_spis + VGIC_NR_PRIVATE_IRQS;
+               value = (value >> 5) - 1;
+               value |= (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5;
+               break;
+       case GIC_DIST_IIDR:
+               value = (PRODUCT_ID_KVM << GICD_IIDR_PRODUCT_ID_SHIFT) |
+                       (vgic->implementation_rev << GICD_IIDR_REVISION_SHIFT) |
+                       (IMPLEMENTER_ARM << GICD_IIDR_IMPLEMENTER_SHIFT);
+               break;
+       default:
+               return 0;
+       }
+
+       return value;
+}
+
+static void vgic_mmio_write_v2_misc(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len,
+                                   unsigned long val)
+{
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+       bool was_enabled = dist->enabled;
+
+       switch (addr & 0x0c) {
+       case GIC_DIST_CTRL:
+               dist->enabled = val & GICD_ENABLE;
+               if (!was_enabled && dist->enabled)
+                       vgic_kick_vcpus(vcpu->kvm);
+               break;
+       case GIC_DIST_CTR:
+       case GIC_DIST_IIDR:
+               /* Nothing to do */
+               return;
+       }
+}
+
+static int vgic_mmio_uaccess_write_v2_misc(struct kvm_vcpu *vcpu,
+                                          gpa_t addr, unsigned int len,
+                                          unsigned long val)
+{
+       switch (addr & 0x0c) {
+       case GIC_DIST_IIDR:
+               if (val != vgic_mmio_read_v2_misc(vcpu, addr, len))
+                       return -EINVAL;
+
+               /*
+                * If we observe a write to GICD_IIDR we know that userspace
+                * has been updated and has had a chance to cope with older
+                * kernels (VGICv2 IIDR.Revision == 0) incorrectly reporting
+                * interrupts as group 1, and therefore we now allow groups to
+                * be user writable.  Doing this by default would break
+                * migration from old kernels to new kernels with legacy
+                * userspace.
+                */
+               vcpu->kvm->arch.vgic.v2_groups_user_writable = true;
+               return 0;
+       }
+
+       vgic_mmio_write_v2_misc(vcpu, addr, len, val);
+       return 0;
+}
+
+static int vgic_mmio_uaccess_write_v2_group(struct kvm_vcpu *vcpu,
+                                           gpa_t addr, unsigned int len,
+                                           unsigned long val)
+{
+       if (vcpu->kvm->arch.vgic.v2_groups_user_writable)
+               vgic_mmio_write_group(vcpu, addr, len, val);
+
+       return 0;
+}
+
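+/*
+ * Emulates a guest write to GICD_SGIR. The bit layout assumed here matches
+ * the extraction below: SGIINTID in bits [3:0], the CPU target list in
+ * bits [23:16] and the target list filter in bits [25:24] (0: use the
+ * target list, 1: all but self, 2: self only, 3: reserved).
+ */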
+static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu,
+                                gpa_t addr, unsigned int len,
+                                unsigned long val)
+{
+       int nr_vcpus = atomic_read(&source_vcpu->kvm->online_vcpus);
+       int intid = val & 0xf;
+       int targets = (val >> 16) & 0xff;
+       int mode = (val >> 24) & 0x03;
+       int c;
+       struct kvm_vcpu *vcpu;
+       unsigned long flags;
+
+       switch (mode) {
+       case 0x0:               /* as specified by targets */
+               break;
+       case 0x1:
+               targets = (1U << nr_vcpus) - 1;                 /* all, ... */
+               targets &= ~(1U << source_vcpu->vcpu_id);       /* but self */
+               break;
+       case 0x2:               /* this very vCPU only */
+               targets = (1U << source_vcpu->vcpu_id);
+               break;
+       case 0x3:               /* reserved */
+               return;
+       }
+
+       kvm_for_each_vcpu(c, vcpu, source_vcpu->kvm) {
+               struct vgic_irq *irq;
+
+               if (!(targets & (1U << c)))
+                       continue;
+
+               irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid);
+
+               raw_spin_lock_irqsave(&irq->irq_lock, flags);
+               irq->pending_latch = true;
+               irq->source |= 1U << source_vcpu->vcpu_id;
+
+               vgic_queue_irq_unlock(source_vcpu->kvm, irq, flags);
+               vgic_put_irq(source_vcpu->kvm, irq);
+       }
+}
+
+static unsigned long vgic_mmio_read_target(struct kvm_vcpu *vcpu,
+                                          gpa_t addr, unsigned int len)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
+       int i;
+       u64 val = 0;
+
+       for (i = 0; i < len; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               val |= (u64)irq->targets << (i * 8);
+
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+
+       return val;
+}
+
+static void vgic_mmio_write_target(struct kvm_vcpu *vcpu,
+                                  gpa_t addr, unsigned int len,
+                                  unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
+       u8 cpu_mask = GENMASK(atomic_read(&vcpu->kvm->online_vcpus) - 1, 0);
+       int i;
+       unsigned long flags;
+
+       /* GICD_ITARGETSR[0-7] are read-only */
+       if (intid < VGIC_NR_PRIVATE_IRQS)
+               return;
+
+       for (i = 0; i < len; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid + i);
+               int target;
+
+               raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+               irq->targets = (val >> (i * 8)) & cpu_mask;
+               target = irq->targets ? __ffs(irq->targets) : 0;
+               irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target);
+
+               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+}
+
+static unsigned long vgic_mmio_read_sgipend(struct kvm_vcpu *vcpu,
+                                           gpa_t addr, unsigned int len)
+{
+       u32 intid = addr & 0x0f;
+       int i;
+       u64 val = 0;
+
+       for (i = 0; i < len; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               val |= (u64)irq->source << (i * 8);
+
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+       return val;
+}
+
+static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu,
+                                    gpa_t addr, unsigned int len,
+                                    unsigned long val)
+{
+       u32 intid = addr & 0x0f;
+       int i;
+       unsigned long flags;
+
+       for (i = 0; i < len; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+               irq->source &= ~((val >> (i * 8)) & 0xff);
+               if (!irq->source)
+                       irq->pending_latch = false;
+
+               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+}
+
+static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu,
+                                    gpa_t addr, unsigned int len,
+                                    unsigned long val)
+{
+       u32 intid = addr & 0x0f;
+       int i;
+       unsigned long flags;
+
+       for (i = 0; i < len; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+               irq->source |= (val >> (i * 8)) & 0xff;
+
+               if (irq->source) {
+                       irq->pending_latch = true;
+                       vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+               } else {
+                       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+               }
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+}
+
+#define GICC_ARCH_VERSION_V2   0x2
+
+/* These are for userland accesses only; there is no guest-facing emulation. */
+static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu *vcpu,
+                                          gpa_t addr, unsigned int len)
+{
+       struct vgic_vmcr vmcr;
+       u32 val;
+
+       vgic_get_vmcr(vcpu, &vmcr);
+
+       switch (addr & 0xff) {
+       case GIC_CPU_CTRL:
+               val = vmcr.grpen0 << GIC_CPU_CTRL_EnableGrp0_SHIFT;
+               val |= vmcr.grpen1 << GIC_CPU_CTRL_EnableGrp1_SHIFT;
+               val |= vmcr.ackctl << GIC_CPU_CTRL_AckCtl_SHIFT;
+               val |= vmcr.fiqen << GIC_CPU_CTRL_FIQEn_SHIFT;
+               val |= vmcr.cbpr << GIC_CPU_CTRL_CBPR_SHIFT;
+               val |= vmcr.eoim << GIC_CPU_CTRL_EOImodeNS_SHIFT;
+
+               break;
+       case GIC_CPU_PRIMASK:
+               /*
+                * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the
+                * PMR field as GICH_VMCR.VMPriMask rather than
+                * GICC_PMR.Priority, so we expose the upper five bits of
+                * priority mask to userspace using the lower bits in the
+                * unsigned long.
+                */
+               val = (vmcr.pmr & GICV_PMR_PRIORITY_MASK) >>
+                       GICV_PMR_PRIORITY_SHIFT;
+               break;
+       case GIC_CPU_BINPOINT:
+               val = vmcr.bpr;
+               break;
+       case GIC_CPU_ALIAS_BINPOINT:
+               val = vmcr.abpr;
+               break;
+       case GIC_CPU_IDENT:
+               val = ((PRODUCT_ID_KVM << 20) |
+                      (GICC_ARCH_VERSION_V2 << 16) |
+                      IMPLEMENTER_ARM);
+               break;
+       default:
+               return 0;
+       }
+
+       return val;
+}
+
+static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu,
+                                  gpa_t addr, unsigned int len,
+                                  unsigned long val)
+{
+       struct vgic_vmcr vmcr;
+
+       vgic_get_vmcr(vcpu, &vmcr);
+
+       switch (addr & 0xff) {
+       case GIC_CPU_CTRL:
+               vmcr.grpen0 = !!(val & GIC_CPU_CTRL_EnableGrp0);
+               vmcr.grpen1 = !!(val & GIC_CPU_CTRL_EnableGrp1);
+               vmcr.ackctl = !!(val & GIC_CPU_CTRL_AckCtl);
+               vmcr.fiqen = !!(val & GIC_CPU_CTRL_FIQEn);
+               vmcr.cbpr = !!(val & GIC_CPU_CTRL_CBPR);
+               vmcr.eoim = !!(val & GIC_CPU_CTRL_EOImodeNS);
+
+               break;
+       case GIC_CPU_PRIMASK:
+               /*
+                * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the
+                * PMR field as GICH_VMCR.VMPriMask rather than
+                * GICC_PMR.Priority, so we expose the upper five bits of
+                * priority mask to userspace using the lower bits in the
+                * unsigned long.
+                */
+               vmcr.pmr = (val << GICV_PMR_PRIORITY_SHIFT) &
+                       GICV_PMR_PRIORITY_MASK;
+               break;
+       case GIC_CPU_BINPOINT:
+               vmcr.bpr = val;
+               break;
+       case GIC_CPU_ALIAS_BINPOINT:
+               vmcr.abpr = val;
+               break;
+       }
+
+       vgic_set_vmcr(vcpu, &vmcr);
+}
+
+static unsigned long vgic_mmio_read_apr(struct kvm_vcpu *vcpu,
+                                       gpa_t addr, unsigned int len)
+{
+       int n; /* which APRn is this */
+
+       n = (addr >> 2) & 0x3;
+
+       if (kvm_vgic_global_state.type == VGIC_V2) {
+               /* GICv2 hardware systems support max. 32 groups */
+               if (n != 0)
+                       return 0;
+               return vcpu->arch.vgic_cpu.vgic_v2.vgic_apr;
+       } else {
+               struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3;
+
+               if (n > vgic_v3_max_apr_idx(vcpu))
+                       return 0;
+
+               n = array_index_nospec(n, 4);
+
+               /* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */
+               return vgicv3->vgic_ap1r[n];
+       }
+}
+
+static void vgic_mmio_write_apr(struct kvm_vcpu *vcpu,
+                               gpa_t addr, unsigned int len,
+                               unsigned long val)
+{
+       int n; /* which APRn is this */
+
+       n = (addr >> 2) & 0x3;
+
+       if (kvm_vgic_global_state.type == VGIC_V2) {
+               /* GICv2 hardware systems support max. 32 groups */
+               if (n != 0)
+                       return;
+               vcpu->arch.vgic_cpu.vgic_v2.vgic_apr = val;
+       } else {
+               struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3;
+
+               if (n > vgic_v3_max_apr_idx(vcpu))
+                       return;
+
+               n = array_index_nospec(n, 4);
+
+               /* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */
+               vgicv3->vgic_ap1r[n] = val;
+       }
+}
+
+static const struct vgic_register_region vgic_v2_dist_registers[] = {
+       REGISTER_DESC_WITH_LENGTH_UACCESS(GIC_DIST_CTRL,
+               vgic_mmio_read_v2_misc, vgic_mmio_write_v2_misc,
+               NULL, vgic_mmio_uaccess_write_v2_misc,
+               12, VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_IGROUP,
+               vgic_mmio_read_group, vgic_mmio_write_group,
+               NULL, vgic_mmio_uaccess_write_v2_group, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_SET,
+               vgic_mmio_read_enable, vgic_mmio_write_senable,
+               NULL, vgic_uaccess_write_senable, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_CLEAR,
+               vgic_mmio_read_enable, vgic_mmio_write_cenable,
+               NULL, vgic_uaccess_write_cenable, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET,
+               vgic_mmio_read_pending, vgic_mmio_write_spending,
+               NULL, vgic_uaccess_write_spending, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_CLEAR,
+               vgic_mmio_read_pending, vgic_mmio_write_cpending,
+               NULL, vgic_uaccess_write_cpending, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET,
+               vgic_mmio_read_active, vgic_mmio_write_sactive,
+               vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_CLEAR,
+               vgic_mmio_read_active, vgic_mmio_write_cactive,
+               vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PRI,
+               vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL,
+               8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_TARGET,
+               vgic_mmio_read_target, vgic_mmio_write_target, NULL, NULL, 8,
+               VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_CONFIG,
+               vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GIC_DIST_SOFTINT,
+               vgic_mmio_read_raz, vgic_mmio_write_sgir, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_CLEAR,
+               vgic_mmio_read_sgipend, vgic_mmio_write_sgipendc, 16,
+               VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+       REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_SET,
+               vgic_mmio_read_sgipend, vgic_mmio_write_sgipends, 16,
+               VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+};
+
+static const struct vgic_register_region vgic_v2_cpu_registers[] = {
+       REGISTER_DESC_WITH_LENGTH(GIC_CPU_CTRL,
+               vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GIC_CPU_PRIMASK,
+               vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GIC_CPU_BINPOINT,
+               vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GIC_CPU_ALIAS_BINPOINT,
+               vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GIC_CPU_ACTIVEPRIO,
+               vgic_mmio_read_apr, vgic_mmio_write_apr, 16,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GIC_CPU_IDENT,
+               vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
+               VGIC_ACCESS_32bit),
+};
+
+unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev)
+{
+       dev->regions = vgic_v2_dist_registers;
+       dev->nr_regions = ARRAY_SIZE(vgic_v2_dist_registers);
+
+       kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops);
+
+       return SZ_4K;
+}
+
+int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+       const struct vgic_register_region *region;
+       struct vgic_io_device iodev;
+       struct vgic_reg_attr reg_attr;
+       struct kvm_vcpu *vcpu;
+       gpa_t addr;
+       int ret;
+
+       ret = vgic_v2_parse_attr(dev, attr, &reg_attr);
+       if (ret)
+               return ret;
+
+       vcpu = reg_attr.vcpu;
+       addr = reg_attr.addr;
+
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+               iodev.regions = vgic_v2_dist_registers;
+               iodev.nr_regions = ARRAY_SIZE(vgic_v2_dist_registers);
+               iodev.base_addr = 0;
+               break;
+       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
+               iodev.regions = vgic_v2_cpu_registers;
+               iodev.nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers);
+               iodev.base_addr = 0;
+               break;
+       default:
+               return -ENXIO;
+       }
+
+       /* We only support aligned 32-bit accesses. */
+       if (addr & 3)
+               return -ENXIO;
+
+       region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32));
+       if (!region)
+               return -ENXIO;
+
+       return 0;
+}
+
+int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+                         int offset, u32 *val)
+{
+       struct vgic_io_device dev = {
+               .regions = vgic_v2_cpu_registers,
+               .nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers),
+               .iodev_type = IODEV_CPUIF,
+       };
+
+       return vgic_uaccess(vcpu, &dev, is_write, offset, val);
+}
+
+int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+                        int offset, u32 *val)
+{
+       struct vgic_io_device dev = {
+               .regions = vgic_v2_dist_registers,
+               .nr_regions = ARRAY_SIZE(vgic_v2_dist_registers),
+               .iodev_type = IODEV_DIST,
+       };
+
+       return vgic_uaccess(vcpu, &dev, is_write, offset, val);
+}
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
new file mode 100644 (file)
index 0000000..89a14ec
--- /dev/null
@@ -0,0 +1,1063 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VGICv3 MMIO handling functions
+ */
+
+#include <linux/bitfield.h>
+#include <linux/irqchip/arm-gic-v3.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/interrupt.h>
+#include <kvm/iodev.h>
+#include <kvm/arm_vgic.h>
+
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_mmu.h>
+
+#include "vgic.h"
+#include "vgic-mmio.h"
+
+/* extract @num bytes at byte offset @offset in @data */
+unsigned long extract_bytes(u64 data, unsigned int offset,
+                           unsigned int num)
+{
+       return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0);
+}
+
+/* allows updates of any half of a 64-bit register (or the whole thing) */
+u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
+                    unsigned long val)
+{
+       int lower = (offset & 4) * 8;
+       int upper = lower + 8 * len - 1;
+
+       reg &= ~GENMASK_ULL(upper, lower);
+       val &= GENMASK_ULL(len * 8 - 1, 0);
+
+       return reg | ((u64)val << lower);
+}
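+
+/*
+ * As a worked example: a 32-bit store of 0xcafebabe to the upper half of a
+ * 64-bit register ends up as update_64bit_reg(reg, 4, 4, 0xcafebabe), which
+ * clears bits [63:32] of @reg and merges the new value back in at that
+ * position, leaving the lower word untouched.
+ */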
+
+bool vgic_has_its(struct kvm *kvm)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+
+       if (dist->vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3)
+               return false;
+
+       return dist->has_its;
+}
+
+bool vgic_supports_direct_msis(struct kvm *kvm)
+{
+       return (kvm_vgic_global_state.has_gicv4_1 ||
+               (kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm)));
+}
+
+/*
+ * The Revision field in the IIDR has the following meanings:
+ *
+ * Revision 2: Interrupt groups are guest-configurable and signaled using
+ *            their configured groups.
+ */
+
+static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
+                                           gpa_t addr, unsigned int len)
+{
+       struct vgic_dist *vgic = &vcpu->kvm->arch.vgic;
+       u32 value = 0;
+
+       switch (addr & 0x0c) {
+       case GICD_CTLR:
+               if (vgic->enabled)
+                       value |= GICD_CTLR_ENABLE_SS_G1;
+               value |= GICD_CTLR_ARE_NS | GICD_CTLR_DS;
+               if (vgic->nassgireq)
+                       value |= GICD_CTLR_nASSGIreq;
+               break;
+       case GICD_TYPER:
+               value = vgic->nr_spis + VGIC_NR_PRIVATE_IRQS;
+               value = (value >> 5) - 1;
+               if (vgic_has_its(vcpu->kvm)) {
+                       value |= (INTERRUPT_ID_BITS_ITS - 1) << 19;
+                       value |= GICD_TYPER_LPIS;
+               } else {
+                       value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19;
+               }
+               break;
+       case GICD_TYPER2:
+               if (kvm_vgic_global_state.has_gicv4_1)
+                       value = GICD_TYPER2_nASSGIcap;
+               break;
+       case GICD_IIDR:
+               value = (PRODUCT_ID_KVM << GICD_IIDR_PRODUCT_ID_SHIFT) |
+                       (vgic->implementation_rev << GICD_IIDR_REVISION_SHIFT) |
+                       (IMPLEMENTER_ARM << GICD_IIDR_IMPLEMENTER_SHIFT);
+               break;
+       default:
+               return 0;
+       }
+
+       return value;
+}
+
+static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len,
+                                   unsigned long val)
+{
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+       switch (addr & 0x0c) {
+       case GICD_CTLR: {
+               bool was_enabled, is_hwsgi;
+
+               mutex_lock(&vcpu->kvm->lock);
+
+               was_enabled = dist->enabled;
+               is_hwsgi = dist->nassgireq;
+
+               dist->enabled = val & GICD_CTLR_ENABLE_SS_G1;
+
+               /* Not a GICv4.1? No HW SGIs */
+               if (!kvm_vgic_global_state.has_gicv4_1)
+                       val &= ~GICD_CTLR_nASSGIreq;
+
+               /* Dist stays enabled? nASSGIreq is RO */
+               if (was_enabled && dist->enabled) {
+                       val &= ~GICD_CTLR_nASSGIreq;
+                       val |= FIELD_PREP(GICD_CTLR_nASSGIreq, is_hwsgi);
+               }
+
+               /* Switching HW SGIs? */
+               dist->nassgireq = val & GICD_CTLR_nASSGIreq;
+               if (is_hwsgi != dist->nassgireq)
+                       vgic_v4_configure_vsgis(vcpu->kvm);
+
+               if (kvm_vgic_global_state.has_gicv4_1 &&
+                   was_enabled != dist->enabled)
+                       kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_RELOAD_GICv4);
+               else if (!was_enabled && dist->enabled)
+                       vgic_kick_vcpus(vcpu->kvm);
+
+               mutex_unlock(&vcpu->kvm->lock);
+               break;
+       }
+       case GICD_TYPER:
+       case GICD_TYPER2:
+       case GICD_IIDR:
+               /* This is at best for documentation purposes... */
+               return;
+       }
+}
+
+static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu,
+                                          gpa_t addr, unsigned int len,
+                                          unsigned long val)
+{
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+       switch (addr & 0x0c) {
+       case GICD_TYPER2:
+       case GICD_IIDR:
+               if (val != vgic_mmio_read_v3_misc(vcpu, addr, len))
+                       return -EINVAL;
+               return 0;
+       case GICD_CTLR:
+               /* Not a GICv4.1? No HW SGIs */
+               if (!kvm_vgic_global_state.has_gicv4_1)
+                       val &= ~GICD_CTLR_nASSGIreq;
+
+               dist->enabled = val & GICD_CTLR_ENABLE_SS_G1;
+               dist->nassgireq = val & GICD_CTLR_nASSGIreq;
+               return 0;
+       }
+
+       vgic_mmio_write_v3_misc(vcpu, addr, len, val);
+       return 0;
+}
+
+static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu,
+                                           gpa_t addr, unsigned int len)
+{
+       int intid = VGIC_ADDR_TO_INTID(addr, 64);
+       struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid);
+       unsigned long ret = 0;
+
+       if (!irq)
+               return 0;
+
+       /* The upper word is RAZ for us. */
+       if (!(addr & 4))
+               ret = extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len);
+
+       vgic_put_irq(vcpu->kvm, irq);
+       return ret;
+}
+
+static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len,
+                                   unsigned long val)
+{
+       int intid = VGIC_ADDR_TO_INTID(addr, 64);
+       struct vgic_irq *irq;
+       unsigned long flags;
+
+       /* The upper word is WI for us since we don't implement Aff3. */
+       if (addr & 4)
+               return;
+
+       irq = vgic_get_irq(vcpu->kvm, NULL, intid);
+
+       if (!irq)
+               return;
+
+       raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+       /* We only care about and preserve Aff0, Aff1 and Aff2. */
+       irq->mpidr = val & GENMASK(23, 0);
+       irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr);
+
+       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+       vgic_put_irq(vcpu->kvm, irq);
+}
+
+static unsigned long vgic_mmio_read_v3r_ctlr(struct kvm_vcpu *vcpu,
+                                            gpa_t addr, unsigned int len)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+       return vgic_cpu->lpis_enabled ? GICR_CTLR_ENABLE_LPIS : 0;
+}
+
+static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu,
+                                    gpa_t addr, unsigned int len,
+                                    unsigned long val)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       bool was_enabled = vgic_cpu->lpis_enabled;
+
+       if (!vgic_has_its(vcpu->kvm))
+               return;
+
+       vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS;
+
+       if (was_enabled && !vgic_cpu->lpis_enabled) {
+               vgic_flush_pending_lpis(vcpu);
+               vgic_its_invalidate_cache(vcpu->kvm);
+       }
+
+       if (!was_enabled && vgic_cpu->lpis_enabled)
+               vgic_enable_lpis(vcpu);
+}
+
+static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
+                                             gpa_t addr, unsigned int len)
+{
+       unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       struct vgic_redist_region *rdreg = vgic_cpu->rdreg;
+       int target_vcpu_id = vcpu->vcpu_id;
+       gpa_t last_rdist_typer = rdreg->base + GICR_TYPER +
+                       (rdreg->free_index - 1) * KVM_VGIC_V3_REDIST_SIZE;
+       u64 value;
+
+       value = (u64)(mpidr & GENMASK(23, 0)) << 32;
+       value |= ((target_vcpu_id & 0xffff) << 8);
+
+       if (addr == last_rdist_typer)
+               value |= GICR_TYPER_LAST;
+       if (vgic_has_its(vcpu->kvm))
+               value |= GICR_TYPER_PLPIS;
+
+       return extract_bytes(value, addr & 7, len);
+}
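+
+/*
+ * The GICR_TYPER value built above carries the VCPU's Aff0-2 in the
+ * Affinity Value field (bits [63:32]), its vcpu_id as the Processor Number
+ * (bits [23:8]), the Last bit for the redistributor sitting at the end of
+ * its region, and PLPIS whenever the VM has an ITS.
+ */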
+
+static unsigned long vgic_mmio_read_v3r_iidr(struct kvm_vcpu *vcpu,
+                                            gpa_t addr, unsigned int len)
+{
+       return (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
+}
+
+static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu,
+                                             gpa_t addr, unsigned int len)
+{
+       switch (addr & 0xffff) {
+       case GICD_PIDR2:
+               /* report a GICv3 compliant implementation */
+               return 0x3b;
+       }
+
+       return 0;
+}
+
+static unsigned long vgic_v3_uaccess_read_pending(struct kvm_vcpu *vcpu,
+                                                 gpa_t addr, unsigned int len)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       u32 value = 0;
+       int i;
+
+       /*
+        * The pending state of an interrupt is latched in the
+        * pending_latch variable. Userspace saves and restores the
+        * pending state and line_level separately.
+        * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.txt
+        * for handling of ISPENDR and ICPENDR.
+        */
+       for (i = 0; i < len * 8; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+               bool state = irq->pending_latch;
+
+               if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
+                       int err;
+
+                       err = irq_get_irqchip_state(irq->host_irq,
+                                                   IRQCHIP_STATE_PENDING,
+                                                   &state);
+                       WARN_ON(err);
+               }
+
+               if (state)
+                       value |= (1U << i);
+
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+
+       return value;
+}
+
+static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu,
+                                        gpa_t addr, unsigned int len,
+                                        unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       int i;
+       unsigned long flags;
+
+       for (i = 0; i < len * 8; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               raw_spin_lock_irqsave(&irq->irq_lock, flags);
+               if (test_bit(i, &val)) {
+                       /*
+                        * pending_latch is set irrespective of the irq type
+                        * (level or edge) so that the VM does not have to
+                        * restore the irq config before the pending info.
+                        */
+                       irq->pending_latch = true;
+                       vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+               } else {
+                       irq->pending_latch = false;
+                       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+               }
+
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+
+       return 0;
+}
+
+/* We want to avoid outer shareable. */
+u64 vgic_sanitise_shareability(u64 field)
+{
+       switch (field) {
+       case GIC_BASER_OuterShareable:
+               return GIC_BASER_InnerShareable;
+       default:
+               return field;
+       }
+}
+
+/* Avoid any inner non-cacheable mapping. */
+u64 vgic_sanitise_inner_cacheability(u64 field)
+{
+       switch (field) {
+       case GIC_BASER_CACHE_nCnB:
+       case GIC_BASER_CACHE_nC:
+               return GIC_BASER_CACHE_RaWb;
+       default:
+               return field;
+       }
+}
+
+/* Non-cacheable or same-as-inner are OK. */
+u64 vgic_sanitise_outer_cacheability(u64 field)
+{
+       switch (field) {
+       case GIC_BASER_CACHE_SameAsInner:
+       case GIC_BASER_CACHE_nC:
+               return field;
+       default:
+               return GIC_BASER_CACHE_nC;
+       }
+}
+
+u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
+                       u64 (*sanitise_fn)(u64))
+{
+       u64 field = (reg & field_mask) >> field_shift;
+
+       field = sanitise_fn(field) << field_shift;
+       return (reg & ~field_mask) | field;
+}
+
+#define PROPBASER_RES0_MASK                                            \
+       (GENMASK_ULL(63, 59) | GENMASK_ULL(55, 52) | GENMASK_ULL(6, 5))
+#define PENDBASER_RES0_MASK                                            \
+       (BIT_ULL(63) | GENMASK_ULL(61, 59) | GENMASK_ULL(55, 52) |      \
+        GENMASK_ULL(15, 12) | GENMASK_ULL(6, 0))
+
+static u64 vgic_sanitise_pendbaser(u64 reg)
+{
+       reg = vgic_sanitise_field(reg, GICR_PENDBASER_SHAREABILITY_MASK,
+                                 GICR_PENDBASER_SHAREABILITY_SHIFT,
+                                 vgic_sanitise_shareability);
+       reg = vgic_sanitise_field(reg, GICR_PENDBASER_INNER_CACHEABILITY_MASK,
+                                 GICR_PENDBASER_INNER_CACHEABILITY_SHIFT,
+                                 vgic_sanitise_inner_cacheability);
+       reg = vgic_sanitise_field(reg, GICR_PENDBASER_OUTER_CACHEABILITY_MASK,
+                                 GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT,
+                                 vgic_sanitise_outer_cacheability);
+
+       reg &= ~PENDBASER_RES0_MASK;
+
+       return reg;
+}
+
+static u64 vgic_sanitise_propbaser(u64 reg)
+{
+       reg = vgic_sanitise_field(reg, GICR_PROPBASER_SHAREABILITY_MASK,
+                                 GICR_PROPBASER_SHAREABILITY_SHIFT,
+                                 vgic_sanitise_shareability);
+       reg = vgic_sanitise_field(reg, GICR_PROPBASER_INNER_CACHEABILITY_MASK,
+                                 GICR_PROPBASER_INNER_CACHEABILITY_SHIFT,
+                                 vgic_sanitise_inner_cacheability);
+       reg = vgic_sanitise_field(reg, GICR_PROPBASER_OUTER_CACHEABILITY_MASK,
+                                 GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT,
+                                 vgic_sanitise_outer_cacheability);
+
+       reg &= ~PROPBASER_RES0_MASK;
+       return reg;
+}
+
+static unsigned long vgic_mmio_read_propbase(struct kvm_vcpu *vcpu,
+                                            gpa_t addr, unsigned int len)
+{
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+       return extract_bytes(dist->propbaser, addr & 7, len);
+}
+
+static void vgic_mmio_write_propbase(struct kvm_vcpu *vcpu,
+                                    gpa_t addr, unsigned int len,
+                                    unsigned long val)
+{
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       u64 old_propbaser, propbaser;
+
+       /* Storing a value with LPIs already enabled is undefined */
+       if (vgic_cpu->lpis_enabled)
+               return;
+
+       do {
+               old_propbaser = READ_ONCE(dist->propbaser);
+               propbaser = old_propbaser;
+               propbaser = update_64bit_reg(propbaser, addr & 4, len, val);
+               propbaser = vgic_sanitise_propbaser(propbaser);
+       } while (cmpxchg64(&dist->propbaser, old_propbaser,
+                          propbaser) != old_propbaser);
+}
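+
+/*
+ * The cmpxchg64() loop above (mirrored for PENDBASER below) lets several
+ * writers update the register concurrently without holding a lock: each
+ * one re-reads the current value, folds its bytes in via update_64bit_reg(),
+ * sanitises the result and only commits if nobody else changed the register
+ * in the meantime.
+ */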
+
+static unsigned long vgic_mmio_read_pendbase(struct kvm_vcpu *vcpu,
+                                            gpa_t addr, unsigned int len)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       u64 value = vgic_cpu->pendbaser;
+
+       value &= ~GICR_PENDBASER_PTZ;
+
+       return extract_bytes(value, addr & 7, len);
+}
+
+static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu,
+                                    gpa_t addr, unsigned int len,
+                                    unsigned long val)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       u64 old_pendbaser, pendbaser;
+
+       /* Storing a value with LPIs already enabled is undefined */
+       if (vgic_cpu->lpis_enabled)
+               return;
+
+       do {
+               old_pendbaser = READ_ONCE(vgic_cpu->pendbaser);
+               pendbaser = old_pendbaser;
+               pendbaser = update_64bit_reg(pendbaser, addr & 4, len, val);
+               pendbaser = vgic_sanitise_pendbaser(pendbaser);
+       } while (cmpxchg64(&vgic_cpu->pendbaser, old_pendbaser,
+                          pendbaser) != old_pendbaser);
+}
+
+/*
+ * The GICv3 per-IRQ registers are split to control PPIs and SGIs in the
+ * redistributors, while SPIs are covered by registers in the distributor
+ * block. Trying to set private IRQs in this block gets ignored.
+ * We take some special care here to fix the calculation of the register
+ * offset.
+ */
+#define REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(off, rd, wr, ur, uw, bpi, acc) \
+       {                                                               \
+               .reg_offset = off,                                      \
+               .bits_per_irq = bpi,                                    \
+               .len = (bpi * VGIC_NR_PRIVATE_IRQS) / 8,                \
+               .access_flags = acc,                                    \
+               .read = vgic_mmio_read_raz,                             \
+               .write = vgic_mmio_write_wi,                            \
+       }, {                                                            \
+               .reg_offset = off + (bpi * VGIC_NR_PRIVATE_IRQS) / 8,   \
+               .bits_per_irq = bpi,                                    \
+               .len = (bpi * (1024 - VGIC_NR_PRIVATE_IRQS)) / 8,       \
+               .access_flags = acc,                                    \
+               .read = rd,                                             \
+               .write = wr,                                            \
+               .uaccess_read = ur,                                     \
+               .uaccess_write = uw,                                    \
+       }
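+
+/*
+ * As a concrete example of the split above: GICD_ISENABLER uses one bit per
+ * interrupt, so its first (1 * 32) / 8 = 4 bytes (covering SGIs and PPIs)
+ * become RAZ/WI here, and the real handlers only ever see offsets from
+ * GICD_ISENABLER + 4 onwards, i.e. SPIs starting at INTID 32.
+ */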
+
+static const struct vgic_register_region vgic_v3_dist_registers[] = {
+       REGISTER_DESC_WITH_LENGTH_UACCESS(GICD_CTLR,
+               vgic_mmio_read_v3_misc, vgic_mmio_write_v3_misc,
+               NULL, vgic_mmio_uaccess_write_v3_misc,
+               16, VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICD_STATUSR,
+               vgic_mmio_read_rao, vgic_mmio_write_wi, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGROUPR,
+               vgic_mmio_read_group, vgic_mmio_write_group, NULL, NULL, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISENABLER,
+               vgic_mmio_read_enable, vgic_mmio_write_senable,
+               NULL, vgic_uaccess_write_senable, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICENABLER,
+               vgic_mmio_read_enable, vgic_mmio_write_cenable,
+               NULL, vgic_uaccess_write_cenable, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR,
+               vgic_mmio_read_pending, vgic_mmio_write_spending,
+               vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR,
+               vgic_mmio_read_pending, vgic_mmio_write_cpending,
+               vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER,
+               vgic_mmio_read_active, vgic_mmio_write_sactive,
+               vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICACTIVER,
+               vgic_mmio_read_active, vgic_mmio_write_cactive,
+               vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive,
+               1, VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IPRIORITYR,
+               vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL,
+               8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ITARGETSR,
+               vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 8,
+               VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICFGR,
+               vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGRPMODR,
+               vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IROUTER,
+               vgic_mmio_read_irouter, vgic_mmio_write_irouter, NULL, NULL, 64,
+               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICD_IDREGS,
+               vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
+               VGIC_ACCESS_32bit),
+};
+
+static const struct vgic_register_region vgic_v3_rd_registers[] = {
+       /* RD_base registers */
+       REGISTER_DESC_WITH_LENGTH(GICR_CTLR,
+               vgic_mmio_read_v3r_ctlr, vgic_mmio_write_v3r_ctlr, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_STATUSR,
+               vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_IIDR,
+               vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_TYPER,
+               vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 8,
+               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_WAKER,
+               vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER,
+               vgic_mmio_read_propbase, vgic_mmio_write_propbase, 8,
+               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER,
+               vgic_mmio_read_pendbase, vgic_mmio_write_pendbase, 8,
+               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_IDREGS,
+               vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
+               VGIC_ACCESS_32bit),
+       /* SGI_base registers */
+       REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IGROUPR0,
+               vgic_mmio_read_group, vgic_mmio_write_group, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISENABLER0,
+               vgic_mmio_read_enable, vgic_mmio_write_senable,
+               NULL, vgic_uaccess_write_senable, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICENABLER0,
+               vgic_mmio_read_enable, vgic_mmio_write_cenable,
+               NULL, vgic_uaccess_write_cenable, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISPENDR0,
+               vgic_mmio_read_pending, vgic_mmio_write_spending,
+               vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICPENDR0,
+               vgic_mmio_read_pending, vgic_mmio_write_cpending,
+               vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISACTIVER0,
+               vgic_mmio_read_active, vgic_mmio_write_sactive,
+               vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICACTIVER0,
+               vgic_mmio_read_active, vgic_mmio_write_cactive,
+               vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IPRIORITYR0,
+               vgic_mmio_read_priority, vgic_mmio_write_priority, 32,
+               VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+       REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_ICFGR0,
+               vgic_mmio_read_config, vgic_mmio_write_config, 8,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IGRPMODR0,
+               vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_NSACR,
+               vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+               VGIC_ACCESS_32bit),
+};
+
+unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev)
+{
+       dev->regions = vgic_v3_dist_registers;
+       dev->nr_regions = ARRAY_SIZE(vgic_v3_dist_registers);
+
+       kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops);
+
+       return SZ_64K;
+}
+
+/**
+ * vgic_register_redist_iodev - register a single redist iodev
+ * @vcpu:    The VCPU to which the redistributor belongs
+ *
+ * Register a KVM iodev for this VCPU's redistributor using the address
+ * provided.
+ *
+ * Return 0 on success, -ERRNO otherwise.
+ */
+int vgic_register_redist_iodev(struct kvm_vcpu *vcpu)
+{
+       struct kvm *kvm = vcpu->kvm;
+       struct vgic_dist *vgic = &kvm->arch.vgic;
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev;
+       struct vgic_redist_region *rdreg;
+       gpa_t rd_base;
+       int ret;
+
+       if (!IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr))
+               return 0;
+
+       /*
+        * We may be creating VCPUs before having set the base address for the
+        * redistributor region, in which case we will come back to this
+        * function for all VCPUs when the base address is set.  Just return
+        * without doing any work for now.
+        */
+       rdreg = vgic_v3_rdist_free_slot(&vgic->rd_regions);
+       if (!rdreg)
+               return 0;
+
+       if (!vgic_v3_check_base(kvm))
+               return -EINVAL;
+
+       vgic_cpu->rdreg = rdreg;
+
+       rd_base = rdreg->base + rdreg->free_index * KVM_VGIC_V3_REDIST_SIZE;
+
+       kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops);
+       rd_dev->base_addr = rd_base;
+       rd_dev->iodev_type = IODEV_REDIST;
+       rd_dev->regions = vgic_v3_rd_registers;
+       rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rd_registers);
+       rd_dev->redist_vcpu = vcpu;
+
+       mutex_lock(&kvm->slots_lock);
+       ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, rd_base,
+                                     2 * SZ_64K, &rd_dev->dev);
+       mutex_unlock(&kvm->slots_lock);
+
+       if (ret)
+               return ret;
+
+       rdreg->free_index++;
+       return 0;
+}
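+
+/*
+ * Note that the iodev registered above spans 2 * 64K: the RD_base frame
+ * followed by the SGI_base frame, which is why the SGI_base entries in
+ * vgic_v3_rd_registers carry a SZ_64K offset.
+ */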
+
+static void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu)
+{
+       struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev;
+
+       kvm_io_bus_unregister_dev(vcpu->kvm, KVM_MMIO_BUS, &rd_dev->dev);
+}
+
+static int vgic_register_all_redist_iodevs(struct kvm *kvm)
+{
+       struct kvm_vcpu *vcpu;
+       int c, ret = 0;
+
+       kvm_for_each_vcpu(c, vcpu, kvm) {
+               ret = vgic_register_redist_iodev(vcpu);
+               if (ret)
+                       break;
+       }
+
+       if (ret) {
+               /* The current c failed, so we start with the previous one. */
+               mutex_lock(&kvm->slots_lock);
+               for (c--; c >= 0; c--) {
+                       vcpu = kvm_get_vcpu(kvm, c);
+                       vgic_unregister_redist_iodev(vcpu);
+               }
+               mutex_unlock(&kvm->slots_lock);
+       }
+
+       return ret;
+}
+
+/**
+ * vgic_v3_insert_redist_region - Insert a new redistributor region
+ *
+ * Performs various checks before inserting the rdist region in the list.
+ * Those tests depend on whether the size of the rdist region is known
+ * (i.e. count != 0). The list is sorted by rdist region index.
+ *
+ * @kvm: kvm handle
+ * @index: redist region index
+ * @base: base of the new rdist region
+ * @count: number of redistributors the region is made of (0 in the old style
+ * single region, whose size is inferred from the number of vcpus)
+ *
+ * Return 0 on success, < 0 otherwise
+ */
+static int vgic_v3_insert_redist_region(struct kvm *kvm, uint32_t index,
+                                       gpa_t base, uint32_t count)
+{
+       struct vgic_dist *d = &kvm->arch.vgic;
+       struct vgic_redist_region *rdreg;
+       struct list_head *rd_regions = &d->rd_regions;
+       size_t size = count * KVM_VGIC_V3_REDIST_SIZE;
+       int ret;
+
+       /* single rdist region already set? */
+       if (!count && !list_empty(rd_regions))
+               return -EINVAL;
+
+       /* cross the end of memory ? */
+       if (base + size < base)
+               return -EINVAL;
+
+       if (list_empty(rd_regions)) {
+               if (index != 0)
+                       return -EINVAL;
+       } else {
+               rdreg = list_last_entry(rd_regions,
+                                       struct vgic_redist_region, list);
+               if (index != rdreg->index + 1)
+                       return -EINVAL;
+
+               /* Cannot add an explicitly sized region after the legacy region */
+               if (!rdreg->count)
+                       return -EINVAL;
+       }
+
+       /*
+        * For legacy single-region redistributor regions (!count),
+        * check that the redistributor region does not overlap with the
+        * distributor's address space.
+        */
+       if (!count && !IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) &&
+               vgic_dist_overlap(kvm, base, size))
+               return -EINVAL;
+
+       /* collision with any other rdist region? */
+       if (vgic_v3_rdist_overlap(kvm, base, size))
+               return -EINVAL;
+
+       rdreg = kzalloc(sizeof(*rdreg), GFP_KERNEL);
+       if (!rdreg)
+               return -ENOMEM;
+
+       rdreg->base = VGIC_ADDR_UNDEF;
+
+       ret = vgic_check_ioaddr(kvm, &rdreg->base, base, SZ_64K);
+       if (ret)
+               goto free;
+
+       rdreg->base = base;
+       rdreg->count = count;
+       rdreg->free_index = 0;
+       rdreg->index = index;
+
+       list_add_tail(&rdreg->list, rd_regions);
+       return 0;
+free:
+       kfree(rdreg);
+       return ret;
+}
+
+int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count)
+{
+       int ret;
+
+       ret = vgic_v3_insert_redist_region(kvm, index, addr, count);
+       if (ret)
+               return ret;
+
+       /*
+        * Register iodevs for each existing VCPU.  Adding more VCPUs
+        * afterwards will register the iodevs when needed.
+        */
+       ret = vgic_register_all_redist_iodevs(kvm);
+       if (ret)
+               return ret;
+
+       return 0;
+}
+
+int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+       const struct vgic_register_region *region;
+       struct vgic_io_device iodev;
+       struct vgic_reg_attr reg_attr;
+       struct kvm_vcpu *vcpu;
+       gpa_t addr;
+       int ret;
+
+       ret = vgic_v3_parse_attr(dev, attr, &reg_attr);
+       if (ret)
+               return ret;
+
+       vcpu = reg_attr.vcpu;
+       addr = reg_attr.addr;
+
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+               iodev.regions = vgic_v3_dist_registers;
+               iodev.nr_regions = ARRAY_SIZE(vgic_v3_dist_registers);
+               iodev.base_addr = 0;
+               break;
+       case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: {
+               iodev.regions = vgic_v3_rd_registers;
+               iodev.nr_regions = ARRAY_SIZE(vgic_v3_rd_registers);
+               iodev.base_addr = 0;
+               break;
+       }
+       case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
+               u64 reg, id;
+
+               id = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK);
+               return vgic_v3_has_cpu_sysregs_attr(vcpu, 0, id, &reg);
+       }
+       default:
+               return -ENXIO;
+       }
+
+       /* We only support aligned 32-bit accesses. */
+       if (addr & 3)
+               return -ENXIO;
+
+       region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32));
+       if (!region)
+               return -ENXIO;
+
+       return 0;
+}
+
+/*
+ * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI
+ * generation register ICC_SGI1R_EL1) with a given VCPU.
+ * If the VCPU's MPIDR matches, return the level0 affinity, otherwise
+ * return -1.
+ */
+static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu)
+{
+       unsigned long affinity;
+       int level0;
+
+       /*
+        * Split the current VCPU's MPIDR into affinity level 0 and the
+        * rest as this is what we have to compare against.
+        */
+       affinity = kvm_vcpu_get_mpidr_aff(vcpu);
+       level0 = MPIDR_AFFINITY_LEVEL(affinity, 0);
+       affinity &= ~MPIDR_LEVEL_MASK;
+
+       /* bail out if the upper three levels don't match */
+       if (sgi_aff != affinity)
+               return -1;
+
+       /* Is this VCPU's bit set in the mask? */
+       if (!(sgi_cpu_mask & BIT(level0)))
+               return -1;
+
+       return level0;
+}
+
+/*
+ * The ICC_SGI* registers encode the affinity differently from the MPIDR,
+ * so provide a wrapper to use the existing defines to isolate a certain
+ * affinity level.
+ */
+#define SGI_AFFINITY_LEVEL(reg, level) \
+       ((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \
+       >> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))
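+
+/*
+ * For example, SGI_AFFINITY_LEVEL(reg, 1) takes the Aff1 field of the SGI
+ * register (bits [23:16] of ICC_SGI1R_EL1) and places it at the Aff1
+ * position of an MPIDR value (bits [15:8]), so that the value assembled in
+ * vgic_v3_dispatch_sgi() can be compared directly against a vCPU's MPIDR.
+ */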
+
+/**
+ * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs
+ * @vcpu: The VCPU requesting an SGI
+ * @reg: The value written into ICC_{ASGI1,SGI0,SGI1}R by that VCPU
+ * @allow_group1: Does the sysreg access allow generation of G1 SGIs
+ *
+ * With GICv3 (and ARE=1), CPUs trigger SGIs by writing to a system register.
+ * This will trap in sys_regs.c and call this function.
+ * The ICC_SGI1R_EL1 register contains the upper three affinity levels of the
+ * target processors as well as a bitmask of 16 Aff0 CPUs.
+ * If the interrupt routing mode bit is not set, we iterate over all VCPUs to
+ * check for matching ones. If this bit is set, we signal all VCPUs but the
+ * calling one.
+ */
+void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1)
+{
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_vcpu *c_vcpu;
+       u16 target_cpus;
+       u64 mpidr;
+       int sgi, c;
+       int vcpu_id = vcpu->vcpu_id;
+       bool broadcast;
+       unsigned long flags;
+
+       sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT;
+       broadcast = reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT);
+       target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT;
+       mpidr = SGI_AFFINITY_LEVEL(reg, 3);
+       mpidr |= SGI_AFFINITY_LEVEL(reg, 2);
+       mpidr |= SGI_AFFINITY_LEVEL(reg, 1);
+
+       /*
+        * We iterate over all VCPUs to find the MPIDRs matching the request.
+        * If we have handled one CPU, we clear its bit to detect early
+        * if we are already finished. This avoids iterating through all
+        * VCPUs when most of the time we just signal a single VCPU.
+        */
+       kvm_for_each_vcpu(c, c_vcpu, kvm) {
+               struct vgic_irq *irq;
+
+               /* Exit early if we have dealt with all requested CPUs */
+               if (!broadcast && target_cpus == 0)
+                       break;
+
+               /* Don't signal the calling VCPU */
+               if (broadcast && c == vcpu_id)
+                       continue;
+
+               if (!broadcast) {
+                       int level0;
+
+                       level0 = match_mpidr(mpidr, target_cpus, c_vcpu);
+                       if (level0 == -1)
+                               continue;
+
+                       /* remove this matching VCPU from the mask */
+                       target_cpus &= ~BIT(level0);
+               }
+
+               irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi);
+
+               raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+               /*
+                * An access targeting Group0 SGIs can only generate
+                * those, while an access targeting Group1 SGIs can
+                * generate interrupts of either group.
+                */
+               if (!irq->group || allow_group1) {
+                       if (!irq->hw) {
+                               irq->pending_latch = true;
+                               vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+                       } else {
+                               /* HW SGI? Ask the GIC to inject it */
+                               int err;
+                               err = irq_set_irqchip_state(irq->host_irq,
+                                                           IRQCHIP_STATE_PENDING,
+                                                           true);
+                               WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
+                               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+                       }
+               } else {
+                       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+               }
+
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+}
+
+int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+                        int offset, u32 *val)
+{
+       struct vgic_io_device dev = {
+               .regions = vgic_v3_dist_registers,
+               .nr_regions = ARRAY_SIZE(vgic_v3_dist_registers),
+       };
+
+       return vgic_uaccess(vcpu, &dev, is_write, offset, val);
+}
+
+int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+                          int offset, u32 *val)
+{
+       struct vgic_io_device rd_dev = {
+               .regions = vgic_v3_rd_registers,
+               .nr_regions = ARRAY_SIZE(vgic_v3_rd_registers),
+       };
+
+       return vgic_uaccess(vcpu, &rd_dev, is_write, offset, val);
+}
+
+int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+                                   u32 intid, u64 *val)
+{
+       if (intid % 32)
+               return -EINVAL;
+
+       if (is_write)
+               vgic_write_irq_line_level_info(vcpu, intid, *val);
+       else
+               *val = vgic_read_irq_line_level_info(vcpu, intid);
+
+       return 0;
+}
diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c
new file mode 100644 (file)
index 0000000..b2d73fc
--- /dev/null
@@ -0,0 +1,1088 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VGIC MMIO handling functions
+ */
+
+#include <linux/bitops.h>
+#include <linux/bsearch.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <kvm/iodev.h>
+#include <kvm/arm_arch_timer.h>
+#include <kvm/arm_vgic.h>
+
+#include "vgic.h"
+#include "vgic-mmio.h"
+
+unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu,
+                                gpa_t addr, unsigned int len)
+{
+       return 0;
+}
+
+unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu,
+                                gpa_t addr, unsigned int len)
+{
+       return -1UL;
+}
+
+void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
+                       unsigned int len, unsigned long val)
+{
+       /* Ignore */
+}
+
+int vgic_mmio_uaccess_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
+                              unsigned int len, unsigned long val)
+{
+       /* Ignore */
+       return 0;
+}
+
+unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu,
+                                  gpa_t addr, unsigned int len)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       u32 value = 0;
+       int i;
+
+       /* Loop over all IRQs affected by this read */
+       for (i = 0; i < len * 8; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               if (irq->group)
+                       value |= BIT(i);
+
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+
+       return value;
+}
+
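+/*
+ * For GICv4.1 directly-injected vSGIs, propagate the current priority and
+ * group configuration to the ITS so that the hardware vSGI matches the
+ * guest-visible state.
+ */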
+static void vgic_update_vsgi(struct vgic_irq *irq)
+{
+       WARN_ON(its_prop_update_vsgi(irq->host_irq, irq->priority, irq->group));
+}
+
+void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr,
+                          unsigned int len, unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       int i;
+       unsigned long flags;
+
+       for (i = 0; i < len * 8; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               raw_spin_lock_irqsave(&irq->irq_lock, flags);
+               irq->group = !!(val & BIT(i));
+               if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
+                       vgic_update_vsgi(irq);
+                       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+               } else {
+                       vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+               }
+
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+}
+
+/*
+ * Read accesses to both GICD_ICENABLER and GICD_ISENABLER return the value
+ * of the enabled bit, so there is only one function for both here.
+ */
+unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       u32 value = 0;
+       int i;
+
+       /* Loop over all IRQs affected by this read */
+       for (i = 0; i < len * 8; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               if (irq->enabled)
+                       value |= (1U << i);
+
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+
+       return value;
+}
+
+void vgic_mmio_write_senable(struct kvm_vcpu *vcpu,
+                            gpa_t addr, unsigned int len,
+                            unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       int i;
+       unsigned long flags;
+
+       for_each_set_bit(i, &val, len * 8) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               raw_spin_lock_irqsave(&irq->irq_lock, flags);
+               if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
+                       if (!irq->enabled) {
+                               struct irq_data *data;
+
+                               irq->enabled = true;
+                               data = &irq_to_desc(irq->host_irq)->irq_data;
+                               while (irqd_irq_disabled(data))
+                                       enable_irq(irq->host_irq);
+                       }
+
+                       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+                       vgic_put_irq(vcpu->kvm, irq);
+
+                       continue;
+               } else if (vgic_irq_is_mapped_level(irq)) {
+                       bool was_high = irq->line_level;
+
+                       /*
+                        * We need to update the state of the interrupt because
+                        * the guest might have changed the state of the device
+                        * while the interrupt was disabled at the VGIC level.
+                        */
+                       irq->line_level = vgic_get_phys_line_level(irq);
+                       /*
+                        * Deactivate the physical interrupt so the GIC will let
+                        * us know when it is asserted again.
+                        */
+                       if (!irq->active && was_high && !irq->line_level)
+                               vgic_irq_set_phys_active(irq, false);
+               }
+               irq->enabled = true;
+               vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+}
+
+void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
+                            gpa_t addr, unsigned int len,
+                            unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       int i;
+       unsigned long flags;
+
+       for_each_set_bit(i, &val, len * 8) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               raw_spin_lock_irqsave(&irq->irq_lock, flags);
+               if (irq->hw && vgic_irq_is_sgi(irq->intid) && irq->enabled)
+                       disable_irq_nosync(irq->host_irq);
+
+               irq->enabled = false;
+
+               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+}
+
+int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu,
+                              gpa_t addr, unsigned int len,
+                              unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       int i;
+       unsigned long flags;
+
+       for_each_set_bit(i, &val, len * 8) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               raw_spin_lock_irqsave(&irq->irq_lock, flags);
+               irq->enabled = true;
+               vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+
+       return 0;
+}
+
+int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu,
+                              gpa_t addr, unsigned int len,
+                              unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       int i;
+       unsigned long flags;
+
+       for_each_set_bit(i, &val, len * 8) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               raw_spin_lock_irqsave(&irq->irq_lock, flags);
+               irq->enabled = false;
+               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+
+       return 0;
+}
+
+unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
+                                    gpa_t addr, unsigned int len)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       u32 value = 0;
+       int i;
+
+       /* Loop over all IRQs affected by this read */
+       for (i = 0; i < len * 8; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+               unsigned long flags;
+               bool val;
+
+               raw_spin_lock_irqsave(&irq->irq_lock, flags);
+               if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
+                       int err;
+
+                       val = false;
+                       err = irq_get_irqchip_state(irq->host_irq,
+                                                   IRQCHIP_STATE_PENDING,
+                                                   &val);
+                       WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
+               } else {
+                       val = irq_is_pending(irq);
+               }
+
+               value |= ((u32)val << i);
+               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+
+       return value;
+}
+
+static bool is_vgic_v2_sgi(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
+{
+       return (vgic_irq_is_sgi(irq->intid) &&
+               vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2);
+}
+
+void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
+                             gpa_t addr, unsigned int len,
+                             unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       int i;
+       unsigned long flags;
+
+       for_each_set_bit(i, &val, len * 8) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               /* GICD_ISPENDR0 SGI bits are WI */
+               if (is_vgic_v2_sgi(vcpu, irq)) {
+                       vgic_put_irq(vcpu->kvm, irq);
+                       continue;
+               }
+
+               raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+               if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
+                       /* HW SGI? Ask the GIC to inject it */
+                       int err;
+                       err = irq_set_irqchip_state(irq->host_irq,
+                                                   IRQCHIP_STATE_PENDING,
+                                                   true);
+                       WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
+
+                       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+                       vgic_put_irq(vcpu->kvm, irq);
+
+                       continue;
+               }
+
+               irq->pending_latch = true;
+               if (irq->hw)
+                       vgic_irq_set_phys_active(irq, true);
+
+               vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+}
+
+int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu,
+                               gpa_t addr, unsigned int len,
+                               unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       int i;
+       unsigned long flags;
+
+       for_each_set_bit(i, &val, len * 8) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               raw_spin_lock_irqsave(&irq->irq_lock, flags);
+               irq->pending_latch = true;
+
+               /*
+                * GICv2 SGIs are terribly broken. We can't restore
+                * the source of the interrupt, so just pick the vcpu
+                * itself as the source...
+                */
+               if (is_vgic_v2_sgi(vcpu, irq))
+                       irq->source |= BIT(vcpu->vcpu_id);
+
+               vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+
+       return 0;
+}
+
+/* Must be called with irq->irq_lock held */
+static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
+{
+       irq->pending_latch = false;
+
+       /*
+        * We don't want the guest to effectively mask the physical
+        * interrupt by doing a write to SPENDR followed by a write to
+        * CPENDR for HW interrupts, so we clear the active state on
+        * the physical side if the virtual interrupt is not active.
+        * This may lead to taking an additional interrupt on the
+        * host, but that should not be a problem as the worst that
+        * can happen is an additional vgic injection.  We also clear
+        * the pending state to maintain proper semantics for edge HW
+        * interrupts.
+        */
+       vgic_irq_set_phys_pending(irq, false);
+       if (!irq->active)
+               vgic_irq_set_phys_active(irq, false);
+}
+
+void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
+                             gpa_t addr, unsigned int len,
+                             unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       int i;
+       unsigned long flags;
+
+       for_each_set_bit(i, &val, len * 8) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               /* GICD_ICPENDR0 SGI bits are WI */
+               if (is_vgic_v2_sgi(vcpu, irq)) {
+                       vgic_put_irq(vcpu->kvm, irq);
+                       continue;
+               }
+
+               raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+               if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
+                       /* HW SGI? Ask the GIC to clear its pending bit */
+                       int err;
+                       err = irq_set_irqchip_state(irq->host_irq,
+                                                   IRQCHIP_STATE_PENDING,
+                                                   false);
+                       WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
+
+                       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+                       vgic_put_irq(vcpu->kvm, irq);
+
+                       continue;
+               }
+
+               if (irq->hw)
+                       vgic_hw_irq_cpending(vcpu, irq);
+               else
+                       irq->pending_latch = false;
+
+               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+}
+
+int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
+                               gpa_t addr, unsigned int len,
+                               unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       int i;
+       unsigned long flags;
+
+       for_each_set_bit(i, &val, len * 8) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               raw_spin_lock_irqsave(&irq->irq_lock, flags);
+               /*
+                * More fun with GICv2 SGIs! If we're clearing one of them
+                * from userspace, which source vcpu to clear? Let's not
+                * even think of it, and blow the whole set.
+                */
+               if (is_vgic_v2_sgi(vcpu, irq))
+                       irq->source = 0;
+
+               irq->pending_latch = false;
+
+               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+
+       return 0;
+}
+
+/*
+ * If we are fiddling with an IRQ's active state, we have to make sure the IRQ
+ * is not queued on some running VCPU's LRs, because then the change to the
+ * active state can be overwritten when the VCPU's state is synced coming back
+ * from the guest.
+ *
+ * For shared interrupts as well as GICv3 private interrupts, we have to
+ * stop all the VCPUs because interrupts can be migrated while we don't hold
+ * the IRQ locks and we don't want to be chasing moving targets.
+ *
+ * For GICv2 private interrupts we don't have to do anything because
+ * userspace accesses to the VGIC state already require all VCPUs to be
+ * stopped, and only the VCPU itself can modify its private interrupts
+ * active state, which guarantees that the VCPU is not running.
+ */
+static void vgic_access_active_prepare(struct kvm_vcpu *vcpu, u32 intid)
+{
+       if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
+           intid >= VGIC_NR_PRIVATE_IRQS)
+               kvm_arm_halt_guest(vcpu->kvm);
+}
+
+/* See vgic_access_active_prepare */
+static void vgic_access_active_finish(struct kvm_vcpu *vcpu, u32 intid)
+{
+       if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
+           intid >= VGIC_NR_PRIVATE_IRQS)
+               kvm_arm_resume_guest(vcpu->kvm);
+}
+
+static unsigned long __vgic_mmio_read_active(struct kvm_vcpu *vcpu,
+                                            gpa_t addr, unsigned int len)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       u32 value = 0;
+       int i;
+
+       /* Loop over all IRQs affected by this read */
+       for (i = 0; i < len * 8; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               /*
+                * Even for HW interrupts, don't evaluate the HW state as
+                * all the guest is interested in is the virtual state.
+                */
+               if (irq->active)
+                       value |= (1U << i);
+
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+
+       return value;
+}
+
+unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       u32 val;
+
+       mutex_lock(&vcpu->kvm->lock);
+       vgic_access_active_prepare(vcpu, intid);
+
+       val = __vgic_mmio_read_active(vcpu, addr, len);
+
+       vgic_access_active_finish(vcpu, intid);
+       mutex_unlock(&vcpu->kvm->lock);
+
+       return val;
+}
+
+unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len)
+{
+       return __vgic_mmio_read_active(vcpu, addr, len);
+}
+
+/* Must be called with irq->irq_lock held */
+static void vgic_hw_irq_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
+                                     bool active, bool is_uaccess)
+{
+       if (is_uaccess)
+               return;
+
+       irq->active = active;
+       vgic_irq_set_phys_active(irq, active);
+}
+
+static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
+                                   bool active)
+{
+       unsigned long flags;
+       struct kvm_vcpu *requester_vcpu = kvm_get_running_vcpu();
+
+       raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+       if (irq->hw && !vgic_irq_is_sgi(irq->intid)) {
+               vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu);
+       } else if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
+               /*
+                * GICv4.1 VSGI feature doesn't track an active state,
+                * so let's not kid ourselves, there is nothing we can
+                * do here.
+                */
+               irq->active = false;
+       } else {
+               u32 model = vcpu->kvm->arch.vgic.vgic_model;
+               u8 active_source;
+
+               irq->active = active;
+
+               /*
+                * The GICv2 architecture indicates that the source CPUID for
+                * an SGI should be provided during an EOI which implies that
+                * the active state is stored somewhere, but at the same time
+                * this state is not architecturally exposed anywhere and we
+                * have no way of knowing the right source.
+                *
+                * This may lead to a VCPU not being able to receive
+                * additional instances of a particular SGI after migration
+                * for a GICv2 VM on some GIC implementations.  Oh well.
+                */
+               active_source = (requester_vcpu) ? requester_vcpu->vcpu_id : 0;
+
+               if (model == KVM_DEV_TYPE_ARM_VGIC_V2 &&
+                   active && vgic_irq_is_sgi(irq->intid))
+                       irq->active_source = active_source;
+       }
+
+       if (irq->active)
+               vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+       else
+               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+}
+
+static void __vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
+                                     gpa_t addr, unsigned int len,
+                                     unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       int i;
+
+       for_each_set_bit(i, &val, len * 8) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+               vgic_mmio_change_active(vcpu, irq, false);
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+}
+
+void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
+                            gpa_t addr, unsigned int len,
+                            unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+
+       mutex_lock(&vcpu->kvm->lock);
+       vgic_access_active_prepare(vcpu, intid);
+
+       __vgic_mmio_write_cactive(vcpu, addr, len, val);
+
+       vgic_access_active_finish(vcpu, intid);
+       mutex_unlock(&vcpu->kvm->lock);
+}
+
+int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu,
+                                    gpa_t addr, unsigned int len,
+                                    unsigned long val)
+{
+       __vgic_mmio_write_cactive(vcpu, addr, len, val);
+       return 0;
+}
+
+static void __vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
+                                     gpa_t addr, unsigned int len,
+                                     unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       int i;
+
+       for_each_set_bit(i, &val, len * 8) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+               vgic_mmio_change_active(vcpu, irq, true);
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+}
+
+void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
+                            gpa_t addr, unsigned int len,
+                            unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+
+       mutex_lock(&vcpu->kvm->lock);
+       vgic_access_active_prepare(vcpu, intid);
+
+       __vgic_mmio_write_sactive(vcpu, addr, len, val);
+
+       vgic_access_active_finish(vcpu, intid);
+       mutex_unlock(&vcpu->kvm->lock);
+}
+
+int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu,
+                                    gpa_t addr, unsigned int len,
+                                    unsigned long val)
+{
+       __vgic_mmio_write_sactive(vcpu, addr, len, val);
+       return 0;
+}
+
+unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
+                                     gpa_t addr, unsigned int len)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
+       int i;
+       u64 val = 0;
+
+       for (i = 0; i < len; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               val |= (u64)irq->priority << (i * 8);
+
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+
+       return val;
+}
+
+/*
+ * We currently don't handle changing the priority of an interrupt that
+ * is already pending on a VCPU. If there is a need for this, we would
+ * need to make this VCPU exit and re-evaluate the priorities, potentially
+ * leading to this interrupt being presented to the guest now (if it had
+ * previously been masked by the priority mask).
+ */
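+/*
+ * Only the top VGIC_PRI_BITS bits of each priority byte are stored: with
+ * VGIC_PRI_BITS == 5, GENMASK(7, 3) is applied below, so a guest write of
+ * 0x1f is read back as 0x18.
+ */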
+void vgic_mmio_write_priority(struct kvm_vcpu *vcpu,
+                             gpa_t addr, unsigned int len,
+                             unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
+       int i;
+       unsigned long flags;
+
+       for (i = 0; i < len; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               raw_spin_lock_irqsave(&irq->irq_lock, flags);
+               /* Narrow the priority range to what we actually support */
+               irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS);
+               if (irq->hw && vgic_irq_is_sgi(irq->intid))
+                       vgic_update_vsgi(irq);
+               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+}
+
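+/*
+ * Each interrupt has two configuration bits in GICD_ICFGR, of which only
+ * the upper one is meaningful (1 = edge-triggered, 0 = level-sensitive).
+ * That is why the read below reports 2 per edge-triggered interrupt and
+ * the write only tests bit (i * 2 + 1).
+ */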
+unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 2);
+       u32 value = 0;
+       int i;
+
+       for (i = 0; i < len * 4; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               if (irq->config == VGIC_CONFIG_EDGE)
+                       value |= (2U << (i * 2));
+
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+
+       return value;
+}
+
+void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
+                           gpa_t addr, unsigned int len,
+                           unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 2);
+       int i;
+       unsigned long flags;
+
+       for (i = 0; i < len * 4; i++) {
+               struct vgic_irq *irq;
+
+               /*
+                * The configuration cannot be changed for SGIs in general;
+                * for PPIs this is IMPLEMENTATION DEFINED. The arch timer
+                * code relies on PPIs being level triggered, so we also
+                * make them read-only here.
+                */
+               if (intid + i < VGIC_NR_PRIVATE_IRQS)
+                       continue;
+
+               irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+               raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+               if (test_bit(i * 2 + 1, &val))
+                       irq->config = VGIC_CONFIG_EDGE;
+               else
+                       irq->config = VGIC_CONFIG_LEVEL;
+
+               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+}
+
+u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid)
+{
+       int i;
+       u64 val = 0;
+       int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
+
+       for (i = 0; i < 32; i++) {
+               struct vgic_irq *irq;
+
+               if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs)
+                       continue;
+
+               irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+               if (irq->config == VGIC_CONFIG_LEVEL && irq->line_level)
+                       val |= (1U << i);
+
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+
+       return val;
+}
+
+void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid,
+                                   const u64 val)
+{
+       int i;
+       int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
+       unsigned long flags;
+
+       for (i = 0; i < 32; i++) {
+               struct vgic_irq *irq;
+               bool new_level;
+
+               if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs)
+                       continue;
+
+               irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               /*
+                * Line level is set irrespective of irq type
+                * (level or edge) to avoid a dependency on the VM
+                * restoring the irq config before the line level.
+                */
+               new_level = !!(val & (1U << i));
+               raw_spin_lock_irqsave(&irq->irq_lock, flags);
+               irq->line_level = new_level;
+               if (new_level)
+                       vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+               else
+                       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+}
+
+static int match_region(const void *key, const void *elt)
+{
+       const unsigned int offset = (unsigned long)key;
+       const struct vgic_register_region *region = elt;
+
+       if (offset < region->reg_offset)
+               return -1;
+
+       if (offset >= region->reg_offset + region->len)
+               return 1;
+
+       return 0;
+}
+
+const struct vgic_register_region *
+vgic_find_mmio_region(const struct vgic_register_region *regions,
+                     int nr_regions, unsigned int offset)
+{
+       return bsearch((void *)(uintptr_t)offset, regions, nr_regions,
+                      sizeof(regions[0]), match_region);
+}
+
+void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
+{
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_set_vmcr(vcpu, vmcr);
+       else
+               vgic_v3_set_vmcr(vcpu, vmcr);
+}
+
+void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
+{
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_get_vmcr(vcpu, vmcr);
+       else
+               vgic_v3_get_vmcr(vcpu, vmcr);
+}
+
+/*
+ * kvm_mmio_read_buf() returns a value in a format where it can be converted
+ * to a byte array and be directly observed as the guest wanted it to appear
+ * in memory if it had done the store itself, which is LE for the GIC, as the
+ * guest knows the GIC is always LE.
+ *
+ * We convert this value to the CPU's native format to deal with it as a data
+ * value.
+ */
+unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len)
+{
+       unsigned long data = kvm_mmio_read_buf(val, len);
+
+       switch (len) {
+       case 1:
+               return data;
+       case 2:
+               return le16_to_cpu(data);
+       case 4:
+               return le32_to_cpu(data);
+       default:
+               return le64_to_cpu(data);
+       }
+}
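+
+/*
+ * For example, a 32-bit guest store of 0x12345678 reaches us as the byte
+ * sequence 78 56 34 12; le32_to_cpu() then yields 0x12345678 on both
+ * little- and big-endian hosts.
+ */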
+
+/*
+ * kvm_mmio_write_buf() expects a value in a format such that if converted to
+ * a byte array it is observed as the guest would see it if it could perform
+ * the load directly.  Since the GIC is LE, and the guest knows this, the
+ * guest expects a value in little endian format.
+ *
+ * We convert the data value from the CPU's native format to LE so that the
+ * value is returned in the proper format.
+ */
+void vgic_data_host_to_mmio_bus(void *buf, unsigned int len,
+                               unsigned long data)
+{
+       switch (len) {
+       case 1:
+               break;
+       case 2:
+               data = cpu_to_le16(data);
+               break;
+       case 4:
+               data = cpu_to_le32(data);
+               break;
+       default:
+               data = cpu_to_le64(data);
+       }
+
+       kvm_mmio_write_buf(buf, len, data);
+}
+
+static
+struct vgic_io_device *kvm_to_vgic_iodev(const struct kvm_io_device *dev)
+{
+       return container_of(dev, struct vgic_io_device, dev);
+}
+
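+/*
+ * Check that an access of @len bytes at @addr is acceptable for @region:
+ * the access width must be one the region supports, the address must be
+ * naturally aligned, and for per-IRQ registers the decoded INTID must be
+ * below the number of IRQs configured for this VM.
+ */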
+static bool check_region(const struct kvm *kvm,
+                        const struct vgic_register_region *region,
+                        gpa_t addr, int len)
+{
+       int flags, nr_irqs = kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
+
+       switch (len) {
+       case sizeof(u8):
+               flags = VGIC_ACCESS_8bit;
+               break;
+       case sizeof(u32):
+               flags = VGIC_ACCESS_32bit;
+               break;
+       case sizeof(u64):
+               flags = VGIC_ACCESS_64bit;
+               break;
+       default:
+               return false;
+       }
+
+       if ((region->access_flags & flags) && IS_ALIGNED(addr, len)) {
+               if (!region->bits_per_irq)
+                       return true;
+
+               /* Do we access a non-allocated IRQ? */
+               return VGIC_ADDR_TO_INTID(addr, region->bits_per_irq) < nr_irqs;
+       }
+
+       return false;
+}
+
+const struct vgic_register_region *
+vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
+                    gpa_t addr, int len)
+{
+       const struct vgic_register_region *region;
+
+       region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions,
+                                      addr - iodev->base_addr);
+       if (!region || !check_region(vcpu->kvm, region, addr, len))
+               return NULL;
+
+       return region;
+}
+
+static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+                            gpa_t addr, u32 *val)
+{
+       struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
+       const struct vgic_register_region *region;
+       struct kvm_vcpu *r_vcpu;
+
+       region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32));
+       if (!region) {
+               *val = 0;
+               return 0;
+       }
+
+       r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu;
+       if (region->uaccess_read)
+               *val = region->uaccess_read(r_vcpu, addr, sizeof(u32));
+       else
+               *val = region->read(r_vcpu, addr, sizeof(u32));
+
+       return 0;
+}
+
+static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+                             gpa_t addr, const u32 *val)
+{
+       struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
+       const struct vgic_register_region *region;
+       struct kvm_vcpu *r_vcpu;
+
+       region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32));
+       if (!region)
+               return 0;
+
+       r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu;
+       if (region->uaccess_write)
+               return region->uaccess_write(r_vcpu, addr, sizeof(u32), *val);
+
+       region->write(r_vcpu, addr, sizeof(u32), *val);
+       return 0;
+}
+
+/*
+ * Userland access to VGIC registers.
+ */
+int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev,
+                bool is_write, int offset, u32 *val)
+{
+       if (is_write)
+               return vgic_uaccess_write(vcpu, &dev->dev, offset, val);
+       else
+               return vgic_uaccess_read(vcpu, &dev->dev, offset, val);
+}
+
+static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+                             gpa_t addr, int len, void *val)
+{
+       struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
+       const struct vgic_register_region *region;
+       unsigned long data = 0;
+
+       region = vgic_get_mmio_region(vcpu, iodev, addr, len);
+       if (!region) {
+               memset(val, 0, len);
+               return 0;
+       }
+
+       switch (iodev->iodev_type) {
+       case IODEV_CPUIF:
+               data = region->read(vcpu, addr, len);
+               break;
+       case IODEV_DIST:
+               data = region->read(vcpu, addr, len);
+               break;
+       case IODEV_REDIST:
+               data = region->read(iodev->redist_vcpu, addr, len);
+               break;
+       case IODEV_ITS:
+               data = region->its_read(vcpu->kvm, iodev->its, addr, len);
+               break;
+       }
+
+       vgic_data_host_to_mmio_bus(val, len, data);
+       return 0;
+}
+
+static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+                              gpa_t addr, int len, const void *val)
+{
+       struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
+       const struct vgic_register_region *region;
+       unsigned long data = vgic_data_mmio_bus_to_host(val, len);
+
+       region = vgic_get_mmio_region(vcpu, iodev, addr, len);
+       if (!region)
+               return 0;
+
+       switch (iodev->iodev_type) {
+       case IODEV_CPUIF:
+               region->write(vcpu, addr, len, data);
+               break;
+       case IODEV_DIST:
+               region->write(vcpu, addr, len, data);
+               break;
+       case IODEV_REDIST:
+               region->write(iodev->redist_vcpu, addr, len, data);
+               break;
+       case IODEV_ITS:
+               region->its_write(vcpu->kvm, iodev->its, addr, len, data);
+               break;
+       }
+
+       return 0;
+}
+
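+/*
+ * Guest MMIO accesses to the distributor, redistributors and ITS are routed
+ * through the KVM_MMIO_BUS to these callbacks, which look up the matching
+ * register handler and convert between the (little endian) bus format and
+ * the host's native byte order.
+ */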
+struct kvm_io_device_ops kvm_io_gic_ops = {
+       .read = dispatch_mmio_read,
+       .write = dispatch_mmio_write,
+};
+
+int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
+                            enum vgic_type type)
+{
+       struct vgic_io_device *io_device = &kvm->arch.vgic.dist_iodev;
+       int ret = 0;
+       unsigned int len;
+
+       switch (type) {
+       case VGIC_V2:
+               len = vgic_v2_init_dist_iodev(io_device);
+               break;
+       case VGIC_V3:
+               len = vgic_v3_init_dist_iodev(io_device);
+               break;
+       default:
+               BUG_ON(1);
+       }
+
+       io_device->base_addr = dist_base_address;
+       io_device->iodev_type = IODEV_DIST;
+       io_device->redist_vcpu = NULL;
+
+       mutex_lock(&kvm->slots_lock);
+       ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist_base_address,
+                                     len, &io_device->dev);
+       mutex_unlock(&kvm->slots_lock);
+
+       return ret;
+}
diff --git a/arch/arm64/kvm/vgic/vgic-mmio.h b/arch/arm64/kvm/vgic/vgic-mmio.h
new file mode 100644 (file)
index 0000000..fefcca2
--- /dev/null
@@ -0,0 +1,227 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ */
+#ifndef __KVM_ARM_VGIC_MMIO_H__
+#define __KVM_ARM_VGIC_MMIO_H__
+
+struct vgic_register_region {
+       unsigned int reg_offset;
+       unsigned int len;
+       unsigned int bits_per_irq;
+       unsigned int access_flags;
+       union {
+               unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr,
+                                     unsigned int len);
+               unsigned long (*its_read)(struct kvm *kvm, struct vgic_its *its,
+                                         gpa_t addr, unsigned int len);
+       };
+       union {
+               void (*write)(struct kvm_vcpu *vcpu, gpa_t addr,
+                             unsigned int len, unsigned long val);
+               void (*its_write)(struct kvm *kvm, struct vgic_its *its,
+                                 gpa_t addr, unsigned int len,
+                                 unsigned long val);
+       };
+       unsigned long (*uaccess_read)(struct kvm_vcpu *vcpu, gpa_t addr,
+                                     unsigned int len);
+       union {
+               int (*uaccess_write)(struct kvm_vcpu *vcpu, gpa_t addr,
+                                    unsigned int len, unsigned long val);
+               int (*uaccess_its_write)(struct kvm *kvm, struct vgic_its *its,
+                                        gpa_t addr, unsigned int len,
+                                        unsigned long val);
+       };
+};
+
+extern struct kvm_io_device_ops kvm_io_gic_ops;
+
+#define VGIC_ACCESS_8bit       1
+#define VGIC_ACCESS_32bit      2
+#define VGIC_ACCESS_64bit      4
+
+/*
+ * Generate a mask that covers the number of bytes required to address
+ * up to 1024 interrupts, each represented by <bits> bits. This assumes
+ * that <bits> is a power of two.
+ */
+#define VGIC_ADDR_IRQ_MASK(bits) (((bits) * 1024 / 8) - 1)
+
+/*
+ * (addr & mask) gives us the _byte_ offset for the INT ID.
+ * We multiply this by 8 to get the _bit_ offset, then divide this by
+ * the number of bits to learn the actual INT ID.
+ * But instead of a division (which requires a "long long div" implementation),
+ * we shift by the binary logarithm of <bits>.
+ * This assumes that <bits> is a power of two.
+ */
+#define VGIC_ADDR_TO_INTID(addr, bits)  (((addr) & VGIC_ADDR_IRQ_MASK(bits)) * \
+                                       8 >> ilog2(bits))
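+
+/*
+ * For example, with 1 bit per IRQ (the enable and pending registers) a byte
+ * offset of 4 decodes to INTID 32, while with 8 bits per IRQ (the priority
+ * registers) a byte offset of 16 decodes to INTID 16.
+ */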
+
+/*
+ * Some VGIC registers store per-IRQ information, with a different number
+ * of bits per IRQ. For those registers the _WITH_BITS_PER_IRQ variant below
+ * is used. The _WITH_LENGTH variant instantiates registers with a fixed
+ * length and is mutually exclusive with the _WITH_BITS_PER_IRQ variant.
+ */
+#define REGISTER_DESC_WITH_BITS_PER_IRQ(off, rd, wr, ur, uw, bpi, acc) \
+       {                                                               \
+               .reg_offset = off,                                      \
+               .bits_per_irq = bpi,                                    \
+               .len = bpi * 1024 / 8,                                  \
+               .access_flags = acc,                                    \
+               .read = rd,                                             \
+               .write = wr,                                            \
+               .uaccess_read = ur,                                     \
+               .uaccess_write = uw,                                    \
+       }
+
+#define REGISTER_DESC_WITH_LENGTH(off, rd, wr, length, acc)            \
+       {                                                               \
+               .reg_offset = off,                                      \
+               .bits_per_irq = 0,                                      \
+               .len = length,                                          \
+               .access_flags = acc,                                    \
+               .read = rd,                                             \
+               .write = wr,                                            \
+       }
+
+#define REGISTER_DESC_WITH_LENGTH_UACCESS(off, rd, wr, urd, uwr, length, acc) \
+       {                                                               \
+               .reg_offset = off,                                      \
+               .bits_per_irq = 0,                                      \
+               .len = length,                                          \
+               .access_flags = acc,                                    \
+               .read = rd,                                             \
+               .write = wr,                                            \
+               .uaccess_read = urd,                                    \
+               .uaccess_write = uwr,                                   \
+       }
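+
+/*
+ * With bpi == 1 (e.g. the enable registers) the _WITH_BITS_PER_IRQ variant
+ * covers 1024 / 8 = 128 bytes of register space, while with bpi == 8 (the
+ * priority registers) it covers 1024 bytes.
+ */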
+
+unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len);
+
+void vgic_data_host_to_mmio_bus(void *buf, unsigned int len,
+                               unsigned long data);
+
+unsigned long extract_bytes(u64 data, unsigned int offset,
+                           unsigned int num);
+
+u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
+                    unsigned long val);
+
+unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu,
+                                gpa_t addr, unsigned int len);
+
+unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu,
+                                gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
+                       unsigned int len, unsigned long val);
+
+int vgic_mmio_uaccess_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
+                              unsigned int len, unsigned long val);
+
+unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu, gpa_t addr,
+                                  unsigned int len);
+
+void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr,
+                          unsigned int len, unsigned long val);
+
+unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_senable(struct kvm_vcpu *vcpu,
+                            gpa_t addr, unsigned int len,
+                            unsigned long val);
+
+void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
+                            gpa_t addr, unsigned int len,
+                            unsigned long val);
+
+int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu,
+                              gpa_t addr, unsigned int len,
+                              unsigned long val);
+
+int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu,
+                              gpa_t addr, unsigned int len,
+                              unsigned long val);
+
+unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
+                                    gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
+                             gpa_t addr, unsigned int len,
+                             unsigned long val);
+
+void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
+                             gpa_t addr, unsigned int len,
+                             unsigned long val);
+
+int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu,
+                               gpa_t addr, unsigned int len,
+                               unsigned long val);
+
+int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
+                               gpa_t addr, unsigned int len,
+                               unsigned long val);
+
+unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len);
+
+unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
+                            gpa_t addr, unsigned int len,
+                            unsigned long val);
+
+void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
+                            gpa_t addr, unsigned int len,
+                            unsigned long val);
+
+int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len,
+                                   unsigned long val);
+
+int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len,
+                                   unsigned long val);
+
+unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
+                                     gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_priority(struct kvm_vcpu *vcpu,
+                             gpa_t addr, unsigned int len,
+                             unsigned long val);
+
+unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
+                           gpa_t addr, unsigned int len,
+                           unsigned long val);
+
+int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev,
+                bool is_write, int offset, u32 *val);
+
+u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid);
+
+void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid,
+                                   const u64 val);
+
+unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev);
+
+unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev);
+
+u64 vgic_sanitise_outer_cacheability(u64 reg);
+u64 vgic_sanitise_inner_cacheability(u64 reg);
+u64 vgic_sanitise_shareability(u64 reg);
+u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
+                       u64 (*sanitise_fn)(u64));
+
+/* Find the proper register handler entry given a certain address offset */
+const struct vgic_register_region *
+vgic_find_mmio_region(const struct vgic_register_region *regions,
+                     int nr_regions, unsigned int offset);
+
+#endif
diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c
new file mode 100644 (file)
index 0000000..621cc16
--- /dev/null
@@ -0,0 +1,504 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ */
+
+#include <linux/irqchip/arm-gic.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <kvm/arm_vgic.h>
+#include <asm/kvm_mmu.h>
+
+#include "vgic.h"
+
+static inline void vgic_v2_write_lr(int lr, u32 val)
+{
+       void __iomem *base = kvm_vgic_global_state.vctrl_base;
+
+       writel_relaxed(val, base + GICH_LR0 + (lr * 4));
+}
+
+void vgic_v2_init_lrs(void)
+{
+       int i;
+
+       for (i = 0; i < kvm_vgic_global_state.nr_lr; i++)
+               vgic_v2_write_lr(i, 0);
+}
+
+void vgic_v2_set_underflow(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2;
+
+       cpuif->vgic_hcr |= GICH_HCR_UIE;
+}
+
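+/*
+ * An LR is worth an EOI maintenance interrupt notification when it has
+ * become invalid (no pending or active state left), has the EOI
+ * (maintenance interrupt request) bit set and does not describe a HW
+ * interrupt.
+ */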
+static bool lr_signals_eoi_mi(u32 lr_val)
+{
+       return !(lr_val & GICH_LR_STATE) && (lr_val & GICH_LR_EOI) &&
+              !(lr_val & GICH_LR_HW);
+}
+
+/*
+ * transfer the content of the LRs back into the corresponding ap_list:
+ * - active bit is transferred as is
+ * - pending bit is
+ *   - transferred as is in case of edge sensitive IRQs
+ *   - set to the line-level (resample time) for level sensitive IRQs
+ */
+void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2;
+       int lr;
+
+       DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
+
+       cpuif->vgic_hcr &= ~GICH_HCR_UIE;
+
+       for (lr = 0; lr < vgic_cpu->used_lrs; lr++) {
+               u32 val = cpuif->vgic_lr[lr];
+               u32 cpuid, intid = val & GICH_LR_VIRTUALID;
+               struct vgic_irq *irq;
+
+               /* Extract the source vCPU id from the LR */
+               cpuid = val & GICH_LR_PHYSID_CPUID;
+               cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
+               cpuid &= 7;
+
+               /* Notify fds when the guest EOI'ed a level-triggered SPI */
+               if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid))
+                       kvm_notify_acked_irq(vcpu->kvm, 0,
+                                            intid - VGIC_NR_PRIVATE_IRQS);
+
+               irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+
+               raw_spin_lock(&irq->irq_lock);
+
+               /* Always preserve the active bit */
+               irq->active = !!(val & GICH_LR_ACTIVE_BIT);
+
+               if (irq->active && vgic_irq_is_sgi(intid))
+                       irq->active_source = cpuid;
+
+               /* Edge is the only case where we preserve the pending bit */
+               if (irq->config == VGIC_CONFIG_EDGE &&
+                   (val & GICH_LR_PENDING_BIT)) {
+                       irq->pending_latch = true;
+
+                       if (vgic_irq_is_sgi(intid))
+                               irq->source |= (1 << cpuid);
+               }
+
+               /*
+                * Clear soft pending state when level irqs have been acked.
+                */
+               if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE))
+                       irq->pending_latch = false;
+
+               /*
+                * Level-triggered mapped IRQs are special because we only
+                * observe rising edges as input to the VGIC.
+                *
+                * If the guest never acked the interrupt we have to sample
+                * the physical line and set the line level, because the
+                * device state could have changed or we simply need to
+                * process the still pending interrupt later.
+                *
+                * If this causes us to lower the level, we have to also clear
+                * the physical active state, since we will otherwise never be
+                * told when the interrupt becomes asserted again.
+                */
+               if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT)) {
+                       irq->line_level = vgic_get_phys_line_level(irq);
+
+                       if (!irq->line_level)
+                               vgic_irq_set_phys_active(irq, false);
+               }
+
+               raw_spin_unlock(&irq->irq_lock);
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+
+       vgic_cpu->used_lrs = 0;
+}
+
+/*
+ * Populates the particular LR with the state of a given IRQ:
+ * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq
+ * - for a level sensitive IRQ the pending state value is unchanged;
+ *   it is dictated directly by the input level
+ *
+ * If @irq describes an SGI with multiple sources, we choose the
+ * lowest-numbered source VCPU and clear that bit in the source bitmap.
+ *
+ * The irq_lock must be held by the caller.
+ */
+void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
+{
+       u32 val = irq->intid;
+       bool allow_pending = true;
+
+       if (irq->active) {
+               val |= GICH_LR_ACTIVE_BIT;
+               if (vgic_irq_is_sgi(irq->intid))
+                       val |= irq->active_source << GICH_LR_PHYSID_CPUID_SHIFT;
+               if (vgic_irq_is_multi_sgi(irq)) {
+                       allow_pending = false;
+                       val |= GICH_LR_EOI;
+               }
+       }
+
+       if (irq->group)
+               val |= GICH_LR_GROUP1;
+
+       if (irq->hw) {
+               val |= GICH_LR_HW;
+               val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT;
+               /*
+                * Never set pending+active on a HW interrupt, as the
+                * pending state is kept at the physical distributor
+                * level.
+                */
+               if (irq->active)
+                       allow_pending = false;
+       } else {
+               if (irq->config == VGIC_CONFIG_LEVEL) {
+                       val |= GICH_LR_EOI;
+
+                       /*
+                        * Software resampling doesn't work very well
+                        * if we allow P+A, so let's not do that.
+                        */
+                       if (irq->active)
+                               allow_pending = false;
+               }
+       }
+
+       if (allow_pending && irq_is_pending(irq)) {
+               val |= GICH_LR_PENDING_BIT;
+
+               if (irq->config == VGIC_CONFIG_EDGE)
+                       irq->pending_latch = false;
+
+               if (vgic_irq_is_sgi(irq->intid)) {
+                       u32 src = ffs(irq->source);
+
+                       if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n",
+                                          irq->intid))
+                               return;
+
+                       val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
+                       irq->source &= ~(1 << (src - 1));
+                       if (irq->source) {
+                               irq->pending_latch = true;
+                               val |= GICH_LR_EOI;
+                       }
+               }
+       }
+
+       /*
+        * Level-triggered mapped IRQs are special because we only observe
+        * rising edges as input to the VGIC.  We therefore lower the line
+        * level here, so that we can take new virtual IRQs.  See
+        * vgic_v2_fold_lr_state for more info.
+        */
+       if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT))
+               irq->line_level = false;
+
+       /* The GICv2 LR only holds five bits of priority. */
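+       /*
+        * For example, a software priority of 0xa0 ends up as 0x14 in the
+        * LR, since the bottom three bits of the 8-bit priority are dropped.
+        */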
+       val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT;
+
+       vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val;
+}
+
+void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr)
+{
+       vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = 0;
+}
+
+void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
+{
+       struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+       u32 vmcr;
+
+       vmcr = (vmcrp->grpen0 << GICH_VMCR_ENABLE_GRP0_SHIFT) &
+               GICH_VMCR_ENABLE_GRP0_MASK;
+       vmcr |= (vmcrp->grpen1 << GICH_VMCR_ENABLE_GRP1_SHIFT) &
+               GICH_VMCR_ENABLE_GRP1_MASK;
+       vmcr |= (vmcrp->ackctl << GICH_VMCR_ACK_CTL_SHIFT) &
+               GICH_VMCR_ACK_CTL_MASK;
+       vmcr |= (vmcrp->fiqen << GICH_VMCR_FIQ_EN_SHIFT) &
+               GICH_VMCR_FIQ_EN_MASK;
+       vmcr |= (vmcrp->cbpr << GICH_VMCR_CBPR_SHIFT) &
+               GICH_VMCR_CBPR_MASK;
+       vmcr |= (vmcrp->eoim << GICH_VMCR_EOI_MODE_SHIFT) &
+               GICH_VMCR_EOI_MODE_MASK;
+       vmcr |= (vmcrp->abpr << GICH_VMCR_ALIAS_BINPOINT_SHIFT) &
+               GICH_VMCR_ALIAS_BINPOINT_MASK;
+       vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) &
+               GICH_VMCR_BINPOINT_MASK;
+       vmcr |= ((vmcrp->pmr >> GICV_PMR_PRIORITY_SHIFT) <<
+                GICH_VMCR_PRIMASK_SHIFT) & GICH_VMCR_PRIMASK_MASK;
+
+       cpu_if->vgic_vmcr = vmcr;
+}
+
+void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
+{
+       struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+       u32 vmcr;
+
+       vmcr = cpu_if->vgic_vmcr;
+
+       vmcrp->grpen0 = (vmcr & GICH_VMCR_ENABLE_GRP0_MASK) >>
+               GICH_VMCR_ENABLE_GRP0_SHIFT;
+       vmcrp->grpen1 = (vmcr & GICH_VMCR_ENABLE_GRP1_MASK) >>
+               GICH_VMCR_ENABLE_GRP1_SHIFT;
+       vmcrp->ackctl = (vmcr & GICH_VMCR_ACK_CTL_MASK) >>
+               GICH_VMCR_ACK_CTL_SHIFT;
+       vmcrp->fiqen = (vmcr & GICH_VMCR_FIQ_EN_MASK) >>
+               GICH_VMCR_FIQ_EN_SHIFT;
+       vmcrp->cbpr = (vmcr & GICH_VMCR_CBPR_MASK) >>
+               GICH_VMCR_CBPR_SHIFT;
+       vmcrp->eoim = (vmcr & GICH_VMCR_EOI_MODE_MASK) >>
+               GICH_VMCR_EOI_MODE_SHIFT;
+
+       vmcrp->abpr = (vmcr & GICH_VMCR_ALIAS_BINPOINT_MASK) >>
+                       GICH_VMCR_ALIAS_BINPOINT_SHIFT;
+       vmcrp->bpr  = (vmcr & GICH_VMCR_BINPOINT_MASK) >>
+                       GICH_VMCR_BINPOINT_SHIFT;
+       vmcrp->pmr  = ((vmcr & GICH_VMCR_PRIMASK_MASK) >>
+                       GICH_VMCR_PRIMASK_SHIFT) << GICV_PMR_PRIORITY_SHIFT;
+}
+
+void vgic_v2_enable(struct kvm_vcpu *vcpu)
+{
+       /*
+        * By forcing VMCR to zero, the GIC will restore the binary
+        * points to their reset values. Anything else resets to zero
+        * anyway.
+        */
+       vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0;
+
+       /* Get the show on the road... */
+       vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN;
+}
+
+/* check for overlapping regions and for regions crossing the end of memory */
+static bool vgic_v2_check_base(gpa_t dist_base, gpa_t cpu_base)
+{
+       if (dist_base + KVM_VGIC_V2_DIST_SIZE < dist_base)
+               return false;
+       if (cpu_base + KVM_VGIC_V2_CPU_SIZE < cpu_base)
+               return false;
+
+       if (dist_base + KVM_VGIC_V2_DIST_SIZE <= cpu_base)
+               return true;
+       if (cpu_base + KVM_VGIC_V2_CPU_SIZE <= dist_base)
+               return true;
+
+       return false;
+}
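+
+/*
+ * For example (illustrative guest addresses only): a distributor frame at
+ * 0x08000000 and a CPU interface frame at 0x08010000 neither wrap nor
+ * overlap, so vgic_v2_check_base() returns true.
+ */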
+
+int vgic_v2_map_resources(struct kvm *kvm)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       int ret = 0;
+
+       if (vgic_ready(kvm))
+               goto out;
+
+       if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
+           IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) {
+               kvm_err("Need to set vgic cpu and dist addresses first\n");
+               ret = -ENXIO;
+               goto out;
+       }
+
+       if (!vgic_v2_check_base(dist->vgic_dist_base, dist->vgic_cpu_base)) {
+               kvm_err("VGIC CPU and dist frames overlap\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /*
+        * Initialize the vgic if this hasn't already been done on demand by
+        * accessing the vgic state from userspace.
+        */
+       ret = vgic_init(kvm);
+       if (ret) {
+               kvm_err("Unable to initialize VGIC dynamic data structures\n");
+               goto out;
+       }
+
+       ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V2);
+       if (ret) {
+               kvm_err("Unable to register VGIC MMIO regions\n");
+               goto out;
+       }
+
+       if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) {
+               ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base,
+                                           kvm_vgic_global_state.vcpu_base,
+                                           KVM_VGIC_V2_CPU_SIZE, true);
+               if (ret) {
+                       kvm_err("Unable to remap VGIC CPU to VCPU\n");
+                       goto out;
+               }
+       }
+
+       dist->ready = true;
+
+out:
+       return ret;
+}
+
+DEFINE_STATIC_KEY_FALSE(vgic_v2_cpuif_trap);
+
+/**
+ * vgic_v2_probe - probe for a VGICv2 compatible interrupt controller
+ * @info:      pointer to the GIC description
+ *
+ * Returns 0 if the VGICv2 has been probed successfully, or an error code
+ * otherwise.
+ */
+int vgic_v2_probe(const struct gic_kvm_info *info)
+{
+       int ret;
+       u32 vtr;
+
+       if (!info->vctrl.start) {
+               kvm_err("GICH not present in the firmware table\n");
+               return -ENXIO;
+       }
+
+       if (!PAGE_ALIGNED(info->vcpu.start) ||
+           !PAGE_ALIGNED(resource_size(&info->vcpu))) {
+               kvm_info("GICV region size/alignment is unsafe, using trapping (reduced performance)\n");
+
+               ret = create_hyp_io_mappings(info->vcpu.start,
+                                            resource_size(&info->vcpu),
+                                            &kvm_vgic_global_state.vcpu_base_va,
+                                            &kvm_vgic_global_state.vcpu_hyp_va);
+               if (ret) {
+                       kvm_err("Cannot map GICV into hyp\n");
+                       goto out;
+               }
+
+               static_branch_enable(&vgic_v2_cpuif_trap);
+       }
+
+       ret = create_hyp_io_mappings(info->vctrl.start,
+                                    resource_size(&info->vctrl),
+                                    &kvm_vgic_global_state.vctrl_base,
+                                    &kvm_vgic_global_state.vctrl_hyp);
+       if (ret) {
+               kvm_err("Cannot map VCTRL into hyp\n");
+               goto out;
+       }
+
+       vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR);
+       kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1;
+
+       ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+       if (ret) {
+               kvm_err("Cannot register GICv2 KVM device\n");
+               goto out;
+       }
+
+       kvm_vgic_global_state.can_emulate_gicv2 = true;
+       kvm_vgic_global_state.vcpu_base = info->vcpu.start;
+       kvm_vgic_global_state.type = VGIC_V2;
+       kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS;
+
+       kvm_debug("vgic-v2@%llx\n", info->vctrl.start);
+
+       return 0;
+out:
+       if (kvm_vgic_global_state.vctrl_base)
+               iounmap(kvm_vgic_global_state.vctrl_base);
+       if (kvm_vgic_global_state.vcpu_base_va)
+               iounmap(kvm_vgic_global_state.vcpu_base_va);
+
+       return ret;
+}
+
+static void save_lrs(struct kvm_vcpu *vcpu, void __iomem *base)
+{
+       struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+       u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+       u64 elrsr;
+       int i;
+
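+       /* A set bit in GICH_ELRSRn means the corresponding LR is empty. */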
+       elrsr = readl_relaxed(base + GICH_ELRSR0);
+       if (unlikely(used_lrs > 32))
+               elrsr |= ((u64)readl_relaxed(base + GICH_ELRSR1)) << 32;
+
+       for (i = 0; i < used_lrs; i++) {
+               if (elrsr & (1UL << i))
+                       cpu_if->vgic_lr[i] &= ~GICH_LR_STATE;
+               else
+                       cpu_if->vgic_lr[i] = readl_relaxed(base + GICH_LR0 + (i * 4));
+
+               writel_relaxed(0, base + GICH_LR0 + (i * 4));
+       }
+}
+
+void vgic_v2_save_state(struct kvm_vcpu *vcpu)
+{
+       void __iomem *base = kvm_vgic_global_state.vctrl_base;
+       u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+
+       if (!base)
+               return;
+
+       if (used_lrs) {
+               save_lrs(vcpu, base);
+               writel_relaxed(0, base + GICH_HCR);
+       }
+}
+
+void vgic_v2_restore_state(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+       void __iomem *base = kvm_vgic_global_state.vctrl_base;
+       u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+       int i;
+
+       if (!base)
+               return;
+
+       if (used_lrs) {
+               writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR);
+               for (i = 0; i < used_lrs; i++) {
+                       writel_relaxed(cpu_if->vgic_lr[i],
+                                      base + GICH_LR0 + (i * 4));
+               }
+       }
+}
+
+void vgic_v2_load(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+
+       writel_relaxed(cpu_if->vgic_vmcr,
+                      kvm_vgic_global_state.vctrl_base + GICH_VMCR);
+       writel_relaxed(cpu_if->vgic_apr,
+                      kvm_vgic_global_state.vctrl_base + GICH_APR);
+}
+
+void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+
+       cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR);
+}
+
+void vgic_v2_put(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+
+       vgic_v2_vmcr_sync(vcpu);
+       cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR);
+}
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
new file mode 100644 (file)
index 0000000..5bc2ab5
--- /dev/null
@@ -0,0 +1,691 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/irqchip/arm-gic-v3.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <kvm/arm_vgic.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_asm.h>
+
+#include "vgic.h"
+
+static bool group0_trap;
+static bool group1_trap;
+static bool common_trap;
+static bool gicv4_enable;
+
+void vgic_v3_set_underflow(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3;
+
+       cpuif->vgic_hcr |= ICH_HCR_UIE;
+}
+
+static bool lr_signals_eoi_mi(u64 lr_val)
+{
+       return !(lr_val & ICH_LR_STATE) && (lr_val & ICH_LR_EOI) &&
+              !(lr_val & ICH_LR_HW);
+}
+
+void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3;
+       u32 model = vcpu->kvm->arch.vgic.vgic_model;
+       int lr;
+
+       DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
+
+       cpuif->vgic_hcr &= ~ICH_HCR_UIE;
+
+       for (lr = 0; lr < vgic_cpu->used_lrs; lr++) {
+               u64 val = cpuif->vgic_lr[lr];
+               u32 intid, cpuid;
+               struct vgic_irq *irq;
+               bool is_v2_sgi = false;
+
+               cpuid = val & GICH_LR_PHYSID_CPUID;
+               cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
+
+               if (model == KVM_DEV_TYPE_ARM_VGIC_V3) {
+                       intid = val & ICH_LR_VIRTUAL_ID_MASK;
+               } else {
+                       intid = val & GICH_LR_VIRTUALID;
+                       is_v2_sgi = vgic_irq_is_sgi(intid);
+               }
+
+               /* Notify fds when the guest EOI'ed a level-triggered IRQ */
+               if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid))
+                       kvm_notify_acked_irq(vcpu->kvm, 0,
+                                            intid - VGIC_NR_PRIVATE_IRQS);
+
+               irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+               if (!irq)       /* An LPI could have been unmapped. */
+                       continue;
+
+               raw_spin_lock(&irq->irq_lock);
+
+               /* Always preserve the active bit */
+               irq->active = !!(val & ICH_LR_ACTIVE_BIT);
+
+               if (irq->active && is_v2_sgi)
+                       irq->active_source = cpuid;
+
+               /* Edge is the only case where we preserve the pending bit */
+               if (irq->config == VGIC_CONFIG_EDGE &&
+                   (val & ICH_LR_PENDING_BIT)) {
+                       irq->pending_latch = true;
+
+                       if (is_v2_sgi)
+                               irq->source |= (1 << cpuid);
+               }
+
+               /*
+                * Clear soft pending state when level irqs have been acked.
+                */
+               if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE))
+                       irq->pending_latch = false;
+
+               /*
+                * Level-triggered mapped IRQs are special because we only
+                * observe rising edges as input to the VGIC.
+                *
+                * If the guest never acked the interrupt we have to sample
+                * the physical line and set the line level, because the
+                * device state could have changed or we simply need to
+                * process the still pending interrupt later.
+                *
+                * If this causes us to lower the level, we have to also clear
+                * the physical active state, since we will otherwise never be
+                * told when the interrupt becomes asserted again.
+                */
+               if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) {
+                       irq->line_level = vgic_get_phys_line_level(irq);
+
+                       if (!irq->line_level)
+                               vgic_irq_set_phys_active(irq, false);
+               }
+
+               raw_spin_unlock(&irq->irq_lock);
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+
+       vgic_cpu->used_lrs = 0;
+}
+
+/* Requires the irq to be locked already */
+void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
+{
+       u32 model = vcpu->kvm->arch.vgic.vgic_model;
+       u64 val = irq->intid;
+       bool allow_pending = true, is_v2_sgi;
+
+       is_v2_sgi = (vgic_irq_is_sgi(irq->intid) &&
+                    model == KVM_DEV_TYPE_ARM_VGIC_V2);
+
+       if (irq->active) {
+               val |= ICH_LR_ACTIVE_BIT;
+               if (is_v2_sgi)
+                       val |= irq->active_source << GICH_LR_PHYSID_CPUID_SHIFT;
+               if (vgic_irq_is_multi_sgi(irq)) {
+                       allow_pending = false;
+                       val |= ICH_LR_EOI;
+               }
+       }
+
+       if (irq->hw) {
+               val |= ICH_LR_HW;
+               val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT;
+               /*
+                * Never set pending+active on a HW interrupt, as the
+                * pending state is kept at the physical distributor
+                * level.
+                */
+               if (irq->active)
+                       allow_pending = false;
+       } else {
+               if (irq->config == VGIC_CONFIG_LEVEL) {
+                       val |= ICH_LR_EOI;
+
+                       /*
+                        * Software resampling doesn't work very well
+                        * if we allow P+A, so let's not do that.
+                        */
+                       if (irq->active)
+                               allow_pending = false;
+               }
+       }
+
+       if (allow_pending && irq_is_pending(irq)) {
+               val |= ICH_LR_PENDING_BIT;
+
+               if (irq->config == VGIC_CONFIG_EDGE)
+                       irq->pending_latch = false;
+
+               if (vgic_irq_is_sgi(irq->intid) &&
+                   model == KVM_DEV_TYPE_ARM_VGIC_V2) {
+                       u32 src = ffs(irq->source);
+
+                       if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n",
+                                          irq->intid))
+                               return;
+
+                       val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
+                       irq->source &= ~(1 << (src - 1));
+                       if (irq->source) {
+                               irq->pending_latch = true;
+                               val |= ICH_LR_EOI;
+                       }
+               }
+       }
+
+       /*
+        * Level-triggered mapped IRQs are special because we only observe
+        * rising edges as input to the VGIC.  We therefore lower the line
+        * level here, so that we can take new virtual IRQs.  See
+        * vgic_v3_fold_lr_state for more info.
+        */
+       if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT))
+               irq->line_level = false;
+
+       if (irq->group)
+               val |= ICH_LR_GROUP;
+
+       val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT;
+
+       vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val;
+}
+
+void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr)
+{
+       vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = 0;
+}
+
+void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
+{
+       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+       u32 model = vcpu->kvm->arch.vgic.vgic_model;
+       u32 vmcr;
+
+       if (model == KVM_DEV_TYPE_ARM_VGIC_V2) {
+               vmcr = (vmcrp->ackctl << ICH_VMCR_ACK_CTL_SHIFT) &
+                       ICH_VMCR_ACK_CTL_MASK;
+               vmcr |= (vmcrp->fiqen << ICH_VMCR_FIQ_EN_SHIFT) &
+                       ICH_VMCR_FIQ_EN_MASK;
+       } else {
+               /*
+                * When emulating GICv3 on GICv3 with SRE=1, the VFIQEn
+                * bit is RES1 and the VAckCtl bit is RES0.
+                */
+               vmcr = ICH_VMCR_FIQ_EN_MASK;
+       }
+
+       vmcr |= (vmcrp->cbpr << ICH_VMCR_CBPR_SHIFT) & ICH_VMCR_CBPR_MASK;
+       vmcr |= (vmcrp->eoim << ICH_VMCR_EOIM_SHIFT) & ICH_VMCR_EOIM_MASK;
+       vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK;
+       vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK;
+       vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK;
+       vmcr |= (vmcrp->grpen0 << ICH_VMCR_ENG0_SHIFT) & ICH_VMCR_ENG0_MASK;
+       vmcr |= (vmcrp->grpen1 << ICH_VMCR_ENG1_SHIFT) & ICH_VMCR_ENG1_MASK;
+
+       cpu_if->vgic_vmcr = vmcr;
+}
+
+void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
+{
+       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+       u32 model = vcpu->kvm->arch.vgic.vgic_model;
+       u32 vmcr;
+
+       vmcr = cpu_if->vgic_vmcr;
+
+       if (model == KVM_DEV_TYPE_ARM_VGIC_V2) {
+               vmcrp->ackctl = (vmcr & ICH_VMCR_ACK_CTL_MASK) >>
+                       ICH_VMCR_ACK_CTL_SHIFT;
+               vmcrp->fiqen = (vmcr & ICH_VMCR_FIQ_EN_MASK) >>
+                       ICH_VMCR_FIQ_EN_SHIFT;
+       } else {
+               /*
+                * When emulating GICv3 on GICv3 with SRE=1, the VFIQEn
+                * bit is RES1 and the VAckCtl bit is RES0.
+                */
+               vmcrp->fiqen = 1;
+               vmcrp->ackctl = 0;
+       }
+
+       vmcrp->cbpr = (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT;
+       vmcrp->eoim = (vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT;
+       vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;
+       vmcrp->bpr  = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT;
+       vmcrp->pmr  = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
+       vmcrp->grpen0 = (vmcr & ICH_VMCR_ENG0_MASK) >> ICH_VMCR_ENG0_SHIFT;
+       vmcrp->grpen1 = (vmcr & ICH_VMCR_ENG1_MASK) >> ICH_VMCR_ENG1_SHIFT;
+}
+
+#define INITIAL_PENDBASER_VALUE                                                  \
+       (GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb)            | \
+       GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner)      | \
+       GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable))
+
+void vgic_v3_enable(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
+
+       /*
+        * By forcing VMCR to zero, the GIC will restore the binary
+        * points to their reset values. Anything else resets to zero
+        * anyway.
+        */
+       vgic_v3->vgic_vmcr = 0;
+
+       /*
+        * If we are emulating a GICv3, we do it in a non-GICv2-compatible
+        * way, so we force SRE to 1 to demonstrate this to the guest.
+        * Also, we don't support any form of IRQ/FIQ bypass.
+        * This goes with the spec allowing the value to be RAO/WI.
+        */
+       if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
+               vgic_v3->vgic_sre = (ICC_SRE_EL1_DIB |
+                                    ICC_SRE_EL1_DFB |
+                                    ICC_SRE_EL1_SRE);
+               vcpu->arch.vgic_cpu.pendbaser = INITIAL_PENDBASER_VALUE;
+       } else {
+               vgic_v3->vgic_sre = 0;
+       }
+
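+       /*
+        * ICH_VTR_EL2 encodes PRIbits as "number of priority bits minus
+        * one", hence the +1 below: a raw field value of 4 means five
+        * priority bits.
+        */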
+       vcpu->arch.vgic_cpu.num_id_bits = (kvm_vgic_global_state.ich_vtr_el2 &
+                                          ICH_VTR_ID_BITS_MASK) >>
+                                          ICH_VTR_ID_BITS_SHIFT;
+       vcpu->arch.vgic_cpu.num_pri_bits = ((kvm_vgic_global_state.ich_vtr_el2 &
+                                           ICH_VTR_PRI_BITS_MASK) >>
+                                           ICH_VTR_PRI_BITS_SHIFT) + 1;
+
+       /* Get the show on the road... */
+       vgic_v3->vgic_hcr = ICH_HCR_EN;
+       if (group0_trap)
+               vgic_v3->vgic_hcr |= ICH_HCR_TALL0;
+       if (group1_trap)
+               vgic_v3->vgic_hcr |= ICH_HCR_TALL1;
+       if (common_trap)
+               vgic_v3->vgic_hcr |= ICH_HCR_TC;
+}
+
+int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq)
+{
+       struct kvm_vcpu *vcpu;
+       int byte_offset, bit_nr;
+       gpa_t pendbase, ptr;
+       bool status;
+       u8 val;
+       int ret;
+       unsigned long flags;
+
+retry:
+       vcpu = irq->target_vcpu;
+       if (!vcpu)
+               return 0;
+
+       pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);
+
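+       /*
+        * The pending table holds one bit per INTID: for example, the first
+        * LPI (INTID 8192) maps to byte 1024, bit 0.
+        */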
+       byte_offset = irq->intid / BITS_PER_BYTE;
+       bit_nr = irq->intid % BITS_PER_BYTE;
+       ptr = pendbase + byte_offset;
+
+       ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
+       if (ret)
+               return ret;
+
+       status = val & (1 << bit_nr);
+
+       raw_spin_lock_irqsave(&irq->irq_lock, flags);
+       if (irq->target_vcpu != vcpu) {
+               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+               goto retry;
+       }
+       irq->pending_latch = status;
+       vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+
+       if (status) {
+               /* clear consumed data */
+               val &= ~(1 << bit_nr);
+               ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
+
+/**
+ * vgic_v3_save_pending_tables - Save the pending tables into guest RAM
+ * @kvm: pointer to the VM whose pending tables are saved
+ *
+ * The kvm lock and all vcpu locks must be held.
+ */
+int vgic_v3_save_pending_tables(struct kvm *kvm)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct vgic_irq *irq;
+       gpa_t last_ptr = ~(gpa_t)0;
+       int ret;
+       u8 val;
+
+       list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+               int byte_offset, bit_nr;
+               struct kvm_vcpu *vcpu;
+               gpa_t pendbase, ptr;
+               bool stored;
+
+               vcpu = irq->target_vcpu;
+               if (!vcpu)
+                       continue;
+
+               pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);
+
+               byte_offset = irq->intid / BITS_PER_BYTE;
+               bit_nr = irq->intid % BITS_PER_BYTE;
+               ptr = pendbase + byte_offset;
+
+               if (ptr != last_ptr) {
+                       ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
+                       if (ret)
+                               return ret;
+                       last_ptr = ptr;
+               }
+
+               stored = val & (1U << bit_nr);
+               if (stored == irq->pending_latch)
+                       continue;
+
+               if (irq->pending_latch)
+                       val |= 1 << bit_nr;
+               else
+                       val &= ~(1 << bit_nr);
+
+               ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
+
+/**
+ * vgic_v3_rdist_overlap - check if a region overlaps with any
+ * existing redistributor region
+ *
+ * @kvm: kvm handle
+ * @base: base of the region
+ * @size: size of region
+ *
+ * Return: true if there is an overlap
+ */
+bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size)
+{
+       struct vgic_dist *d = &kvm->arch.vgic;
+       struct vgic_redist_region *rdreg;
+
+       list_for_each_entry(rdreg, &d->rd_regions, list) {
+               if ((base + size > rdreg->base) &&
+                       (base < rdreg->base + vgic_v3_rd_region_size(kvm, rdreg)))
+                       return true;
+       }
+       return false;
+}
+
+/*
+ * Check for overlapping regions and for regions crossing the end of memory
+ * for base addresses which have already been set.
+ */
+bool vgic_v3_check_base(struct kvm *kvm)
+{
+       struct vgic_dist *d = &kvm->arch.vgic;
+       struct vgic_redist_region *rdreg;
+
+       if (!IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) &&
+           d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE < d->vgic_dist_base)
+               return false;
+
+       list_for_each_entry(rdreg, &d->rd_regions, list) {
+               if (rdreg->base + vgic_v3_rd_region_size(kvm, rdreg) <
+                       rdreg->base)
+                       return false;
+       }
+
+       if (IS_VGIC_ADDR_UNDEF(d->vgic_dist_base))
+               return true;
+
+       return !vgic_v3_rdist_overlap(kvm, d->vgic_dist_base,
+                                     KVM_VGIC_V3_DIST_SIZE);
+}
+
+/**
+ * vgic_v3_rdist_free_slot - Look up registered rdist regions and identify one
+ * which has free space to put a new rdist region.
+ *
+ * @rd_regions: redistributor region list head
+ *
+ * A redistributor region maps n redistributors, where n = region size / (2 x 64kB).
+ * The stride between redistributors is 0 and regions are filled in index order.
+ *
+ * Return: the redist region handle, if any, that has space to map a new rdist
+ * region.
+ */
+struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rd_regions)
+{
+       struct vgic_redist_region *rdreg;
+
+       list_for_each_entry(rdreg, rd_regions, list) {
+               if (!vgic_v3_redist_region_full(rdreg))
+                       return rdreg;
+       }
+       return NULL;
+}
+
+struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm,
+                                                          u32 index)
+{
+       struct list_head *rd_regions = &kvm->arch.vgic.rd_regions;
+       struct vgic_redist_region *rdreg;
+
+       list_for_each_entry(rdreg, rd_regions, list) {
+               if (rdreg->index == index)
+                       return rdreg;
+       }
+       return NULL;
+}
+
+
+int vgic_v3_map_resources(struct kvm *kvm)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct kvm_vcpu *vcpu;
+       int ret = 0;
+       int c;
+
+       if (vgic_ready(kvm))
+               goto out;
+
+       kvm_for_each_vcpu(c, vcpu, kvm) {
+               struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+               if (IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr)) {
+                       kvm_debug("vcpu %d redistributor base not set\n", c);
+                       ret = -ENXIO;
+                       goto out;
+               }
+       }
+
+       if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base)) {
+               kvm_err("Need to set vgic distributor addresses first\n");
+               ret = -ENXIO;
+               goto out;
+       }
+
+       if (!vgic_v3_check_base(kvm)) {
+               kvm_err("VGIC redist and dist frames overlap\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /*
+        * For a VGICv3 we require the userland to explicitly initialize
+        * the VGIC before we need to use it.
+        */
+       if (!vgic_initialized(kvm)) {
+               ret = -EBUSY;
+               goto out;
+       }
+
+       ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V3);
+       if (ret) {
+               kvm_err("Unable to register VGICv3 dist MMIO regions\n");
+               goto out;
+       }
+
+       if (kvm_vgic_global_state.has_gicv4_1)
+               vgic_v4_configure_vsgis(kvm);
+       dist->ready = true;
+
+out:
+       return ret;
+}
+
+DEFINE_STATIC_KEY_FALSE(vgic_v3_cpuif_trap);
+
+static int __init early_group0_trap_cfg(char *buf)
+{
+       return strtobool(buf, &group0_trap);
+}
+early_param("kvm-arm.vgic_v3_group0_trap", early_group0_trap_cfg);
+
+static int __init early_group1_trap_cfg(char *buf)
+{
+       return strtobool(buf, &group1_trap);
+}
+early_param("kvm-arm.vgic_v3_group1_trap", early_group1_trap_cfg);
+
+static int __init early_common_trap_cfg(char *buf)
+{
+       return strtobool(buf, &common_trap);
+}
+early_param("kvm-arm.vgic_v3_common_trap", early_common_trap_cfg);
+
+static int __init early_gicv4_enable(char *buf)
+{
+       return strtobool(buf, &gicv4_enable);
+}
+early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable);
+
+/**
+ * vgic_v3_probe - probe for a VGICv3 compatible interrupt controller
+ * @info:      pointer to the GIC description
+ *
+ * Returns 0 if the VGICv3 has been probed successfully, or an error code
+ * otherwise.
+ */
+int vgic_v3_probe(const struct gic_kvm_info *info)
+{
+       u32 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_ich_vtr_el2);
+       int ret;
+
+       /*
+        * The ListRegs field is 5 bits, but there is an architectural
+        * maximum of 16 list registers. Just ignore bit 4...
+        */
+       kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1;
+       kvm_vgic_global_state.can_emulate_gicv2 = false;
+       kvm_vgic_global_state.ich_vtr_el2 = ich_vtr_el2;
+
+       /* GICv4 support? */
+       if (info->has_v4) {
+               kvm_vgic_global_state.has_gicv4 = gicv4_enable;
+               kvm_vgic_global_state.has_gicv4_1 = info->has_v4_1 && gicv4_enable;
+               kvm_info("GICv4%s support %sabled\n",
+                        kvm_vgic_global_state.has_gicv4_1 ? ".1" : "",
+                        gicv4_enable ? "en" : "dis");
+       }
+
+       if (!info->vcpu.start) {
+               kvm_info("GICv3: no GICV resource entry\n");
+               kvm_vgic_global_state.vcpu_base = 0;
+       } else if (!PAGE_ALIGNED(info->vcpu.start)) {
+               pr_warn("GICV physical address 0x%llx not page aligned\n",
+                       (unsigned long long)info->vcpu.start);
+               kvm_vgic_global_state.vcpu_base = 0;
+       } else {
+               kvm_vgic_global_state.vcpu_base = info->vcpu.start;
+               kvm_vgic_global_state.can_emulate_gicv2 = true;
+               ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+               if (ret) {
+                       kvm_err("Cannot register GICv2 KVM device.\n");
+                       return ret;
+               }
+               kvm_info("vgic-v2@%llx\n", info->vcpu.start);
+       }
+       ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3);
+       if (ret) {
+               kvm_err("Cannot register GICv3 KVM device.\n");
+               kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2);
+               return ret;
+       }
+
+       if (kvm_vgic_global_state.vcpu_base == 0)
+               kvm_info("disabling GICv2 emulation\n");
+
+       if (cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_30115)) {
+               group0_trap = true;
+               group1_trap = true;
+       }
+
+       if (group0_trap || group1_trap || common_trap) {
+               kvm_info("GICv3 sysreg trapping enabled ([%s%s%s], reduced performance)\n",
+                        group0_trap ? "G0" : "",
+                        group1_trap ? "G1" : "",
+                        common_trap ? "C"  : "");
+               static_branch_enable(&vgic_v3_cpuif_trap);
+       }
+
+       kvm_vgic_global_state.vctrl_base = NULL;
+       kvm_vgic_global_state.type = VGIC_V3;
+       kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS;
+
+       return 0;
+}
+
+void vgic_v3_load(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+
+       /*
+        * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen
+        * is dependent on ICC_SRE_EL1.SRE, and we have to perform the
+        * VMCR_EL2 save/restore in the world switch.
+        */
+       if (likely(cpu_if->vgic_sre))
+               kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr);
+
+       kvm_call_hyp(__vgic_v3_restore_aprs, vcpu);
+
+       if (has_vhe())
+               __vgic_v3_activate_traps(vcpu);
+
+       WARN_ON(vgic_v4_load(vcpu));
+}
+
+void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+
+       if (likely(cpu_if->vgic_sre))
+               cpu_if->vgic_vmcr = kvm_call_hyp_ret(__vgic_v3_read_vmcr);
+}
+
+void vgic_v3_put(struct kvm_vcpu *vcpu)
+{
+       WARN_ON(vgic_v4_put(vcpu, false));
+
+       vgic_v3_vmcr_sync(vcpu);
+
+       kvm_call_hyp(__vgic_v3_save_aprs, vcpu);
+
+       if (has_vhe())
+               __vgic_v3_deactivate_traps(vcpu);
+}
diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
new file mode 100644 (file)
index 0000000..27ac833
--- /dev/null
@@ -0,0 +1,453 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017 ARM Ltd.
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
+#include <linux/kvm_host.h>
+#include <linux/irqchip/arm-gic-v3.h>
+
+#include "vgic.h"
+
+/*
+ * How KVM uses GICv4 (insert rude comments here):
+ *
+ * The vgic-v4 layer acts as a bridge between several entities:
+ * - The GICv4 ITS representation offered by the ITS driver
+ * - VFIO, which is in charge of the PCI endpoint
+ * - The virtual ITS, which is the only thing the guest sees
+ *
+ * The configuration of VLPIs is triggered by a callback from VFIO,
+ * instructing KVM that a PCI device has been configured to deliver
+ * MSIs to a vITS.
+ *
+ * kvm_vgic_v4_set_forwarding() is thus called with the routing entry,
+ * and this is used to find the corresponding vITS data structures
+ * (ITS instance, device, event and irq) using a process that is
+ * extremely similar to the injection of an MSI.
+ *
+ * At this stage, we can link the guest's view of an LPI (uniquely
+ * identified by the routing entry) and the host irq, using the GICv4
+ * driver mapping operation. Should the mapping succeed, we've then
+ * successfully upgraded the guest's LPI to a VLPI. We can then start
+ * with updating GICv4's view of the property table and generating an
+ * INValidation in order to kickstart the delivery of this VLPI to the
+ * guest directly, without software intervention. Well, almost.
+ *
+ * When the PCI endpoint is deconfigured, this operation is reversed
+ * with VFIO calling kvm_vgic_v4_unset_forwarding().
+ *
+ * Once the VLPI has been mapped, it needs to follow any change the
+ * guest performs on its LPI through the vITS. For that, a number of
+ * command handlers have hooks to communicate these changes to the HW:
+ * - Any invalidation triggers a call to its_prop_update_vlpi()
+ * - The INT command results in an irq_set_irqchip_state(), which
+ *   generates an INT on the corresponding VLPI.
+ * - The CLEAR command results in an irq_set_irqchip_state(), which
+ *   generates a CLEAR on the corresponding VLPI.
+ * - DISCARD translates into an unmap, similar to a call to
+ *   kvm_vgic_v4_unset_forwarding().
+ * - MOVI is translated by an update of the existing mapping, changing
+ *   the target vcpu, resulting in a VMOVI being generated.
+ * - MOVALL is translated by a string of mapping updates (similar to
+ *   the handling of MOVI). MOVALL is horrible.
+ *
+ * Note that a DISCARD/MAPTI sequence emitted from the guest without
+ * reprogramming the PCI endpoint after MAPTI does not result in a
+ * VLPI being mapped, as there is no callback from VFIO (the guest
+ * will get the interrupt via the normal SW injection). Fixing this is
+ * not trivial, and requires some horrible messing with the VFIO
+ * internals. Not fun. Don't do that.
+ *
+ * Then there is the scheduling. Each time a vcpu is about to run on a
+ * physical CPU, KVM must tell the corresponding redistributor about
+ * it. And if we've migrated our vcpu from one CPU to another, we must
+ * tell the ITS (so that the messages reach the right redistributor).
+ * This is done in two steps: first issue an irq_set_affinity() on the
+ * irq corresponding to the vcpu, then call its_make_vpe_resident().
+ * You must be in a non-preemptible context. On exit, a call to
+ * its_make_vpe_non_resident() tells the redistributor that we're done
+ * with the vcpu.
+ *
+ * Finally, the doorbell handling: Each vcpu is allocated an interrupt
+ * which will fire each time a VLPI is made pending whilst the vcpu is
+ * not running. Each time the vcpu gets blocked, the doorbell
+ * interrupt gets enabled. When the vcpu is unblocked (for whatever
+ * reason), the doorbell interrupt is disabled.
+ */
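+
+/*
+ * A rough sketch of the residency flow described above, as driven from the
+ * vcpu load/put paths (see vgic_v3_load()/vgic_v3_put(); simplified, not a
+ * literal call site):
+ *
+ *     preempt_disable();
+ *     WARN_ON(vgic_v4_load(vcpu));        (irq_set_affinity() +
+ *                                          its_make_vpe_resident())
+ *     ... run the vcpu ...
+ *     WARN_ON(vgic_v4_put(vcpu, false));  (its_make_vpe_non_resident())
+ *     preempt_enable();
+ */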
+
+#define DB_IRQ_FLAGS   (IRQ_NOAUTOEN | IRQ_DISABLE_UNLAZY | IRQ_NO_BALANCING)
+
+static irqreturn_t vgic_v4_doorbell_handler(int irq, void *info)
+{
+       struct kvm_vcpu *vcpu = info;
+
+       /* We got the message, no need to fire again */
+       if (!kvm_vgic_global_state.has_gicv4_1 &&
+           !irqd_irq_disabled(&irq_to_desc(irq)->irq_data))
+               disable_irq_nosync(irq);
+
+       vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last = true;
+       kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+       kvm_vcpu_kick(vcpu);
+
+       return IRQ_HANDLED;
+}
+
+static void vgic_v4_sync_sgi_config(struct its_vpe *vpe, struct vgic_irq *irq)
+{
+       vpe->sgi_config[irq->intid].enabled     = irq->enabled;
+       vpe->sgi_config[irq->intid].group       = irq->group;
+       vpe->sgi_config[irq->intid].priority    = irq->priority;
+}
+
+static void vgic_v4_enable_vsgis(struct kvm_vcpu *vcpu)
+{
+       struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+       int i;
+
+       /*
+        * With GICv4.1, every virtual SGI can be directly injected. So
+        * let's pretend that they are HW interrupts, tied to a host
+        * IRQ. The SGI code will do its magic.
+        */
+       for (i = 0; i < VGIC_NR_SGIS; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, i);
+               struct irq_desc *desc;
+               unsigned long flags;
+               int ret;
+
+               raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+               if (irq->hw)
+                       goto unlock;
+
+               irq->hw = true;
+               irq->host_irq = irq_find_mapping(vpe->sgi_domain, i);
+
+               /* Transfer the full irq state to the vPE */
+               vgic_v4_sync_sgi_config(vpe, irq);
+               desc = irq_to_desc(irq->host_irq);
+               ret = irq_domain_activate_irq(irq_desc_get_irq_data(desc),
+                                             false);
+               if (!WARN_ON(ret)) {
+                       /* Transfer pending state */
+                       ret = irq_set_irqchip_state(irq->host_irq,
+                                                   IRQCHIP_STATE_PENDING,
+                                                   irq->pending_latch);
+                       WARN_ON(ret);
+                       irq->pending_latch = false;
+               }
+       unlock:
+               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+}
+
+static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu)
+{
+       int i;
+
+       for (i = 0; i < VGIC_NR_SGIS; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, i);
+               struct irq_desc *desc;
+               unsigned long flags;
+               int ret;
+
+               raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+               if (!irq->hw)
+                       goto unlock;
+
+               irq->hw = false;
+               ret = irq_get_irqchip_state(irq->host_irq,
+                                           IRQCHIP_STATE_PENDING,
+                                           &irq->pending_latch);
+               WARN_ON(ret);
+
+               desc = irq_to_desc(irq->host_irq);
+               irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
+       unlock:
+               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+}
+
+/* Must be called with the kvm lock held */
+void vgic_v4_configure_vsgis(struct kvm *kvm)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct kvm_vcpu *vcpu;
+       int i;
+
+       kvm_arm_halt_guest(kvm);
+
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               if (dist->nassgireq)
+                       vgic_v4_enable_vsgis(vcpu);
+               else
+                       vgic_v4_disable_vsgis(vcpu);
+       }
+
+       kvm_arm_resume_guest(kvm);
+}
+
+/**
+ * vgic_v4_init - Initialize the GICv4 data structures
+ * @kvm:       Pointer to the VM being initialized
+ *
+ * We may be called each time a vITS is created, or when the
+ * vgic is initialized. This relies on kvm->lock to be
+ * held. In both cases, the number of vcpus should now be
+ * fixed.
+ */
+int vgic_v4_init(struct kvm *kvm)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct kvm_vcpu *vcpu;
+       int i, nr_vcpus, ret;
+
+       if (!kvm_vgic_global_state.has_gicv4)
+               return 0; /* Nothing to see here... move along. */
+
+       if (dist->its_vm.vpes)
+               return 0;
+
+       nr_vcpus = atomic_read(&kvm->online_vcpus);
+
+       dist->its_vm.vpes = kcalloc(nr_vcpus, sizeof(*dist->its_vm.vpes),
+                                   GFP_KERNEL);
+       if (!dist->its_vm.vpes)
+               return -ENOMEM;
+
+       dist->its_vm.nr_vpes = nr_vcpus;
+
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               dist->its_vm.vpes[i] = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+
+       ret = its_alloc_vcpu_irqs(&dist->its_vm);
+       if (ret < 0) {
+               kvm_err("VPE IRQ allocation failure\n");
+               kfree(dist->its_vm.vpes);
+               dist->its_vm.nr_vpes = 0;
+               dist->its_vm.vpes = NULL;
+               return ret;
+       }
+
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               int irq = dist->its_vm.vpes[i]->irq;
+               unsigned long irq_flags = DB_IRQ_FLAGS;
+
+               /*
+                * Don't automatically enable the doorbell, as we're
+                * flipping it back and forth when the vcpu gets
+                * blocked. Also disable the lazy disabling, as the
+                * doorbell could kick us out of the guest too
+                * early...
+                *
+                * On GICv4.1, the doorbell is managed in HW and must
+                * be left enabled.
+                */
+               if (kvm_vgic_global_state.has_gicv4_1)
+                       irq_flags &= ~IRQ_NOAUTOEN;
+               irq_set_status_flags(irq, irq_flags);
+
+               ret = request_irq(irq, vgic_v4_doorbell_handler,
+                                 0, "vcpu", vcpu);
+               if (ret) {
+                       kvm_err("failed to allocate vcpu IRQ%d\n", irq);
+                       /*
+                        * Trick: adjust the number of vpes so we know
+                        * how many to nuke on teardown...
+                        */
+                       dist->its_vm.nr_vpes = i;
+                       break;
+               }
+       }
+
+       if (ret)
+               vgic_v4_teardown(kvm);
+
+       return ret;
+}
+
+/**
+ * vgic_v4_teardown - Free the GICv4 data structures
+ * @kvm:       Pointer to the VM being destroyed
+ *
+ * Relies on kvm->lock to be held.
+ */
+void vgic_v4_teardown(struct kvm *kvm)
+{
+       struct its_vm *its_vm = &kvm->arch.vgic.its_vm;
+       int i;
+
+       if (!its_vm->vpes)
+               return;
+
+       for (i = 0; i < its_vm->nr_vpes; i++) {
+               struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, i);
+               int irq = its_vm->vpes[i]->irq;
+
+               irq_clear_status_flags(irq, DB_IRQ_FLAGS);
+               free_irq(irq, vcpu);
+       }
+
+       its_free_vcpu_irqs(its_vm);
+       kfree(its_vm->vpes);
+       its_vm->nr_vpes = 0;
+       its_vm->vpes = NULL;
+}
+
+int vgic_v4_put(struct kvm_vcpu *vcpu, bool need_db)
+{
+       struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+
+       if (!vgic_supports_direct_msis(vcpu->kvm) || !vpe->resident)
+               return 0;
+
+       return its_make_vpe_non_resident(vpe, need_db);
+}
+
+int vgic_v4_load(struct kvm_vcpu *vcpu)
+{
+       struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+       int err;
+
+       if (!vgic_supports_direct_msis(vcpu->kvm) || vpe->resident)
+               return 0;
+
+       /*
+        * Before making the VPE resident, make sure the redistributor
+        * corresponding to our current CPU expects us here. See the
+        * doc in drivers/irqchip/irq-gic-v4.c to understand how this
+        * turns into a VMOVP command at the ITS level.
+        */
+       err = irq_set_affinity(vpe->irq, cpumask_of(smp_processor_id()));
+       if (err)
+               return err;
+
+       err = its_make_vpe_resident(vpe, false, vcpu->kvm->arch.vgic.enabled);
+       if (err)
+               return err;
+
+       /*
+        * Now that the VPE is resident, let's get rid of a potential
+        * doorbell interrupt that would still be pending. This is a
+        * GICv4.0 only "feature"...
+        */
+       if (!kvm_vgic_global_state.has_gicv4_1)
+               err = irq_set_irqchip_state(vpe->irq, IRQCHIP_STATE_PENDING, false);
+
+       return err;
+}
+
+static struct vgic_its *vgic_get_its(struct kvm *kvm,
+                                    struct kvm_kernel_irq_routing_entry *irq_entry)
+{
+       struct kvm_msi msi  = (struct kvm_msi) {
+               .address_lo     = irq_entry->msi.address_lo,
+               .address_hi     = irq_entry->msi.address_hi,
+               .data           = irq_entry->msi.data,
+               .flags          = irq_entry->msi.flags,
+               .devid          = irq_entry->msi.devid,
+       };
+
+       return vgic_msi_to_its(kvm, &msi);
+}
+
+int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
+                              struct kvm_kernel_irq_routing_entry *irq_entry)
+{
+       struct vgic_its *its;
+       struct vgic_irq *irq;
+       struct its_vlpi_map map;
+       int ret;
+
+       if (!vgic_supports_direct_msis(kvm))
+               return 0;
+
+       /*
+        * Get the ITS, and escape early on error (not a valid
+        * doorbell for any of our vITSs).
+        */
+       its = vgic_get_its(kvm, irq_entry);
+       if (IS_ERR(its))
+               return 0;
+
+       mutex_lock(&its->its_lock);
+
+       /* Perform the actual DevID/EventID -> LPI translation. */
+       ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid,
+                                  irq_entry->msi.data, &irq);
+       if (ret)
+               goto out;
+
+       /*
+        * Emit the mapping request. If it fails, the ITS probably
+        * isn't v4 compatible, so let's silently bail out. Holding
+        * the ITS lock should ensure that nothing can modify the
+        * target vcpu.
+        */
+       map = (struct its_vlpi_map) {
+               .vm             = &kvm->arch.vgic.its_vm,
+               .vpe            = &irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe,
+               .vintid         = irq->intid,
+               .properties     = ((irq->priority & 0xfc) |
+                                  (irq->enabled ? LPI_PROP_ENABLED : 0) |
+                                  LPI_PROP_GROUP1),
+               .db_enabled     = true,
+       };
+
+       ret = its_map_vlpi(virq, &map);
+       if (ret)
+               goto out;
+
+       irq->hw         = true;
+       irq->host_irq   = virq;
+       atomic_inc(&map.vpe->vlpi_count);
+
+out:
+       mutex_unlock(&its->its_lock);
+       return ret;
+}
+
+int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int virq,
+                                struct kvm_kernel_irq_routing_entry *irq_entry)
+{
+       struct vgic_its *its;
+       struct vgic_irq *irq;
+       int ret;
+
+       if (!vgic_supports_direct_msis(kvm))
+               return 0;
+
+       /*
+        * Get the ITS, and escape early on error (not a valid
+        * doorbell for any of our vITSs).
+        */
+       its = vgic_get_its(kvm, irq_entry);
+       if (IS_ERR(its))
+               return 0;
+
+       mutex_lock(&its->its_lock);
+
+       ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid,
+                                  irq_entry->msi.data, &irq);
+       if (ret)
+               goto out;
+
+       WARN_ON(!(irq->hw && irq->host_irq == virq));
+       if (irq->hw) {
+               atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count);
+               irq->hw = false;
+               ret = its_unmap_vlpi(virq);
+       }
+
+out:
+       mutex_unlock(&its->its_lock);
+       return ret;
+}
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
new file mode 100644 (file)
index 0000000..99b02ca
--- /dev/null
@@ -0,0 +1,1011 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/list_sort.h>
+#include <linux/nospec.h>
+
+#include <asm/kvm_hyp.h>
+
+#include "vgic.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+struct vgic_global kvm_vgic_global_state __ro_after_init = {
+       .gicv3_cpuif = STATIC_KEY_FALSE_INIT,
+};
+
+/*
+ * Locking order is always:
+ * kvm->lock (mutex)
+ *   its->cmd_lock (mutex)
+ *     its->its_lock (mutex)
+ *       vgic_cpu->ap_list_lock                must be taken with IRQs disabled
+ *         kvm->lpi_list_lock          must be taken with IRQs disabled
+ *           vgic_irq->irq_lock                must be taken with IRQs disabled
+ *
+ * As the ap_list_lock might be taken from the timer interrupt handler,
+ * we have to disable IRQs before taking this lock and everything lower
+ * than it.
+ *
+ * If you need to take multiple locks, always take the upper lock first,
+ * then the lower ones, e.g. first take the its_lock, then the irq_lock.
+ * If you are already holding a lock and need to take a higher one, you
+ * have to drop the lower ranking lock first and re-acquire it after having
+ * taken the upper one.
+ *
+ * When taking more than one ap_list_lock at the same time, always take the
+ * lowest numbered VCPU's ap_list_lock first, so:
+ *   vcpuX->vcpu_id < vcpuY->vcpu_id:
+ *     raw_spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock);
+ *     raw_spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock);
+ *
+ * Since the VGIC must support injecting virtual interrupts from ISRs, we have
+ * to use the raw_spin_lock_irqsave/raw_spin_unlock_irqrestore versions of outer
+ * spinlocks for any lock that may be taken while injecting an interrupt.
+ */
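
As a concrete illustration of this ordering, a hedged sketch of an ITS-side
update that needs both an its_lock and an irq_lock is shown below (hypothetical
helper, error handling omitted). The its_lock is a mutex and may sleep, so it
is taken first; the irq_lock is a raw spinlock, taken last with interrupts
disabled.

/* Hedged sketch of the documented lock nesting for an ITS-side update. */
static void example_update_lpi(struct vgic_its *its, struct vgic_irq *irq,
                               u8 new_priority)
{
        unsigned long flags;

        mutex_lock(&its->its_lock);

        raw_spin_lock_irqsave(&irq->irq_lock, flags);
        irq->priority = new_priority;
        raw_spin_unlock_irqrestore(&irq->irq_lock, flags);

        mutex_unlock(&its->its_lock);
}
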
+
+/*
+ * Iterate over the VM's list of mapped LPIs to find the one with a
+ * matching interrupt ID and return a reference to the IRQ structure.
+ */
+static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct vgic_irq *irq = NULL;
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+
+       list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+               if (irq->intid != intid)
+                       continue;
+
+               /*
+                * This increases the refcount, the caller is expected to
+                * call vgic_put_irq() later once it's finished with the IRQ.
+                */
+               vgic_get_irq_kref(irq);
+               goto out_unlock;
+       }
+       irq = NULL;
+
+out_unlock:
+       raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+
+       return irq;
+}
+
+/*
+ * This looks up the virtual interrupt ID to get the corresponding
+ * struct vgic_irq. It also increases the refcount, so any caller is expected
+ * to call vgic_put_irq() once it's finished with this IRQ.
+ */
+struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
+                             u32 intid)
+{
+       /* SGIs and PPIs */
+       if (intid <= VGIC_MAX_PRIVATE) {
+               intid = array_index_nospec(intid, VGIC_MAX_PRIVATE + 1);
+               return &vcpu->arch.vgic_cpu.private_irqs[intid];
+       }
+
+       /* SPIs */
+       if (intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) {
+               intid = array_index_nospec(intid, kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS);
+               return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS];
+       }
+
+       /* LPIs */
+       if (intid >= VGIC_MIN_LPI)
+               return vgic_get_lpi(kvm, intid);
+
+       WARN(1, "Looking up struct vgic_irq for reserved INTID");
+       return NULL;
+}
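
Any caller that gets a non-NULL result from vgic_get_irq() owns a reference
for LPIs and must hand it back with vgic_put_irq() when done. A minimal sketch
of that calling pattern (hypothetical caller, using only functions declared in
this file):

/* Hedged sketch of the get/lock/unlock/put discipline for a vgic_irq. */
static void example_poke_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 intid)
{
        struct vgic_irq *irq = vgic_get_irq(kvm, vcpu, intid);
        unsigned long flags;

        if (!irq)
                return;

        raw_spin_lock_irqsave(&irq->irq_lock, flags);
        /* ... inspect or update the per-IRQ state here ... */
        raw_spin_unlock_irqrestore(&irq->irq_lock, flags);

        /* Drop the reference taken by vgic_get_irq() (a no-op for non-LPIs). */
        vgic_put_irq(kvm, irq);
}
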
+
+/*
+ * We can't do anything in here, because we lack the kvm pointer to
+ * lock and remove the item from the lpi_list. So we keep this function
+ * empty and use the return value of kref_put() to trigger the freeing.
+ */
+static void vgic_irq_release(struct kref *ref)
+{
+}
+
+/*
+ * Drop the refcount on the LPI. Must be called with lpi_list_lock held.
+ */
+void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+
+       if (!kref_put(&irq->refcount, vgic_irq_release))
+               return;
+
+       list_del(&irq->lpi_list);
+       dist->lpi_list_count--;
+
+       kfree(irq);
+}
+
+void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       unsigned long flags;
+
+       if (irq->intid < VGIC_MIN_LPI)
+               return;
+
+       raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+       __vgic_put_lpi_locked(kvm, irq);
+       raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+}
+
+void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       struct vgic_irq *irq, *tmp;
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
+
+       list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
+               if (irq->intid >= VGIC_MIN_LPI) {
+                       raw_spin_lock(&irq->irq_lock);
+                       list_del(&irq->ap_list);
+                       irq->vcpu = NULL;
+                       raw_spin_unlock(&irq->irq_lock);
+                       vgic_put_irq(vcpu->kvm, irq);
+               }
+       }
+
+       raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
+}
+
+void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending)
+{
+       WARN_ON(irq_set_irqchip_state(irq->host_irq,
+                                     IRQCHIP_STATE_PENDING,
+                                     pending));
+}
+
+bool vgic_get_phys_line_level(struct vgic_irq *irq)
+{
+       bool line_level;
+
+       BUG_ON(!irq->hw);
+
+       if (irq->get_input_level)
+               return irq->get_input_level(irq->intid);
+
+       WARN_ON(irq_get_irqchip_state(irq->host_irq,
+                                     IRQCHIP_STATE_PENDING,
+                                     &line_level));
+       return line_level;
+}
+
+/* Set/Clear the physical active state */
+void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active)
+{
+
+       BUG_ON(!irq->hw);
+       WARN_ON(irq_set_irqchip_state(irq->host_irq,
+                                     IRQCHIP_STATE_ACTIVE,
+                                     active));
+}
+
+/**
+ * vgic_target_oracle - compute the target vcpu for an irq
+ *
+ * @irq:       The irq to route. Must be already locked.
+ *
+ * Based on the current state of the interrupt (enabled, pending,
+ * active, vcpu and target_vcpu), compute the next vcpu this should be
+ * given to. Return NULL if this shouldn't be injected at all.
+ *
+ * Requires the IRQ lock to be held.
+ */
+static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq)
+{
+       lockdep_assert_held(&irq->irq_lock);
+
+       /* If the interrupt is active, it must stay on the current vcpu */
+       if (irq->active)
+               return irq->vcpu ? : irq->target_vcpu;
+
+       /*
+        * If the IRQ is not active but enabled and pending, we should direct
+        * it to its configured target VCPU.
+        * If the distributor is disabled, pending interrupts shouldn't be
+        * forwarded.
+        */
+       if (irq->enabled && irq_is_pending(irq)) {
+               if (unlikely(irq->target_vcpu &&
+                            !irq->target_vcpu->kvm->arch.vgic.enabled))
+                       return NULL;
+
+               return irq->target_vcpu;
+       }
+
+       /* If neither active nor pending and enabled, then this IRQ should not
+        * be queued to any VCPU.
+        */
+       return NULL;
+}
+
+/*
+ * The order of items in the ap_lists defines how we'll pack things in LRs as
+ * well, the first items in the list being the first things populated in the
+ * LRs.
+ *
+ * A hard rule is that active interrupts can never be pushed out of the LRs
+ * (and therefore take priority) since we cannot reliably trap on deactivation
+ * of IRQs and therefore they have to be present in the LRs.
+ *
+ * Otherwise things should be sorted by the priority field and the GIC
+ * hardware support will take care of preemption of priority groups etc.
+ *
+ * Return negative if "a" sorts before "b", 0 to preserve order, and positive
+ * to sort "b" before "a".
+ */
+static int vgic_irq_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+       struct vgic_irq *irqa = container_of(a, struct vgic_irq, ap_list);
+       struct vgic_irq *irqb = container_of(b, struct vgic_irq, ap_list);
+       bool penda, pendb;
+       int ret;
+
+       /*
+        * list_sort may call this function with the same element when
+        * the list is fairly long.
+        */
+       if (unlikely(irqa == irqb))
+               return 0;
+
+       raw_spin_lock(&irqa->irq_lock);
+       raw_spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING);
+
+       if (irqa->active || irqb->active) {
+               ret = (int)irqb->active - (int)irqa->active;
+               goto out;
+       }
+
+       penda = irqa->enabled && irq_is_pending(irqa);
+       pendb = irqb->enabled && irq_is_pending(irqb);
+
+       if (!penda || !pendb) {
+               ret = (int)pendb - (int)penda;
+               goto out;
+       }
+
+       /* Both pending and enabled, sort by priority */
+       ret = irqa->priority - irqb->priority;
+out:
+       raw_spin_unlock(&irqb->irq_lock);
+       raw_spin_unlock(&irqa->irq_lock);
+       return ret;
+}
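
The comparator orders active interrupts first, then anything pending and
enabled, then by ascending priority value (lower numbers are more urgent on
the GIC). The standalone model below exercises the same rule with qsort() on a
stripped-down stand-in for vgic_irq; it is an illustration, not the kernel code.

/* Standalone model of the ap_list ordering implemented by vgic_irq_cmp(). */
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

struct fake_irq {
        const char *name;
        bool active;
        bool pending;
        bool enabled;
        unsigned char priority;      /* 0 is the highest priority */
};

static int fake_irq_cmp(const void *pa, const void *pb)
{
        const struct fake_irq *a = pa, *b = pb;
        bool penda = a->enabled && a->pending;
        bool pendb = b->enabled && b->pending;

        /* Active interrupts always sort first (they must stay in LRs). */
        if (a->active || b->active)
                return (int)b->active - (int)a->active;

        /* Then anything pending and enabled ... */
        if (!penda || !pendb)
                return (int)pendb - (int)penda;

        /* ... ordered by priority (lower value == more urgent). */
        return a->priority - b->priority;
}

int main(void)
{
        struct fake_irq irqs[] = {
                { "spi-low",  false, true,  true,  0xc0 },
                { "spi-high", false, true,  true,  0x40 },
                { "active",   true,  false, true,  0xf0 },
                { "disabled", false, true,  false, 0x00 },
        };
        int i;

        qsort(irqs, 4, sizeof(irqs[0]), fake_irq_cmp);

        /* Expected order: active, spi-high, spi-low, disabled. */
        for (i = 0; i < 4; i++)
                printf("%s\n", irqs[i].name);
        return 0;
}
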
+
+/* Must be called with the ap_list_lock held */
+static void vgic_sort_ap_list(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+       lockdep_assert_held(&vgic_cpu->ap_list_lock);
+
+       list_sort(NULL, &vgic_cpu->ap_list_head, vgic_irq_cmp);
+}
+
+/*
+ * Only valid injection if changing level for level-triggered IRQs or for a
+ * rising edge, and in-kernel connected IRQ lines can only be controlled by
+ * their owner.
+ */
+static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owner)
+{
+       if (irq->owner != owner)
+               return false;
+
+       switch (irq->config) {
+       case VGIC_CONFIG_LEVEL:
+               return irq->line_level != level;
+       case VGIC_CONFIG_EDGE:
+               return level;
+       }
+
+       return false;
+}
+
+/*
+ * Check whether an IRQ needs to (and can) be queued to a VCPU's ap list.
+ * Do the queuing if necessary, taking the right locks in the right order.
+ * Returns true when the IRQ was queued, false otherwise.
+ *
+ * Needs to be entered with the IRQ lock already held, but will return
+ * with all locks dropped.
+ */
+bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
+                          unsigned long flags)
+{
+       struct kvm_vcpu *vcpu;
+
+       lockdep_assert_held(&irq->irq_lock);
+
+retry:
+       vcpu = vgic_target_oracle(irq);
+       if (irq->vcpu || !vcpu) {
+               /*
+                * If this IRQ is already on a VCPU's ap_list, then it
+                * cannot be moved or modified and there is no more work for
+                * us to do.
+                *
+                * Otherwise, if the irq is not pending and enabled, it does
+                * not need to be inserted into an ap_list and there is also
+                * no more work for us to do.
+                */
+               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+               /*
+                * We have to kick the VCPU here, because we could be
+                * queueing an edge-triggered interrupt for which we
+                * get no EOI maintenance interrupt. In that case,
+                * while the IRQ is already on the VCPU's AP list, the
+                * VCPU could have EOI'ed the original interrupt and
+                * won't see this one until it exits for some other
+                * reason.
+                */
+               if (vcpu) {
+                       kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+                       kvm_vcpu_kick(vcpu);
+               }
+               return false;
+       }
+
+       /*
+        * We must unlock the irq lock to take the ap_list_lock where
+        * we are going to insert this new pending interrupt.
+        */
+       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+       /* The IRQ may change while unlocked; we re-check the target below */
+
+       raw_spin_lock_irqsave(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
+       raw_spin_lock(&irq->irq_lock);
+
+       /*
+        * Did something change behind our backs?
+        *
+        * There are two cases:
+        * 1) The irq lost its pending state or was disabled behind our
+        *    backs and/or it was queued to another VCPU's ap_list.
+        * 2) Someone changed the affinity on this irq behind our
+        *    backs and we are now holding the wrong ap_list_lock.
+        *
+        * In both cases, drop the locks and retry.
+        */
+
+       if (unlikely(irq->vcpu || vcpu != vgic_target_oracle(irq))) {
+               raw_spin_unlock(&irq->irq_lock);
+               raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock,
+                                          flags);
+
+               raw_spin_lock_irqsave(&irq->irq_lock, flags);
+               goto retry;
+       }
+
+       /*
+        * Grab a reference to the irq to reflect the fact that it is
+        * now in the ap_list.
+        */
+       vgic_get_irq_kref(irq);
+       list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
+       irq->vcpu = vcpu;
+
+       raw_spin_unlock(&irq->irq_lock);
+       raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
+
+       kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+       kvm_vcpu_kick(vcpu);
+
+       return true;
+}
+
+/**
+ * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic
+ * @kvm:     The VM structure pointer
+ * @cpuid:   The CPU for PPIs
+ * @intid:   The INTID to inject a new state to.
+ * @level:   Edge-triggered:  true:  to trigger the interrupt
+ *                           false: to ignore the call
+ *          Level-sensitive  true:  raise the input signal
+ *                           false: lower the input signal
+ * @owner:   The opaque pointer to the owner of the IRQ being raised to verify
+ *           that the caller is allowed to inject this IRQ.  Userspace
+ *           injections will have owner == NULL.
+ *
+ * The VGIC is not concerned with devices being active-LOW or active-HIGH for
+ * level-sensitive interrupts.  You can think of the level parameter as 1
+ * being HIGH and 0 being LOW and all devices being active-HIGH.
+ */
+int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
+                       bool level, void *owner)
+{
+       struct kvm_vcpu *vcpu;
+       struct vgic_irq *irq;
+       unsigned long flags;
+       int ret;
+
+       trace_vgic_update_irq_pending(cpuid, intid, level);
+
+       ret = vgic_lazy_init(kvm);
+       if (ret)
+               return ret;
+
+       vcpu = kvm_get_vcpu(kvm, cpuid);
+       if (!vcpu && intid < VGIC_NR_PRIVATE_IRQS)
+               return -EINVAL;
+
+       irq = vgic_get_irq(kvm, vcpu, intid);
+       if (!irq)
+               return -EINVAL;
+
+       raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+       if (!vgic_validate_injection(irq, level, owner)) {
+               /* Nothing to see here, move along... */
+               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+               vgic_put_irq(kvm, irq);
+               return 0;
+       }
+
+       if (irq->config == VGIC_CONFIG_LEVEL)
+               irq->line_level = level;
+       else
+               irq->pending_latch = true;
+
+       vgic_queue_irq_unlock(kvm, irq, flags);
+       vgic_put_irq(kvm, irq);
+
+       return 0;
+}
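
For level-sensitive sources the caller is expected to drive the line both ways,
while edge-triggered sources only care about the rising edge. Below is a hedged
sketch of raising and later lowering a level-triggered SPI with no in-kernel
owner (as a userspace-originated injection would); the INTID is made up for
illustration.

/* Hedged sketch: driving a level-sensitive SPI via kvm_vgic_inject_irq().
 * EXAMPLE_SPI_INTID is a hypothetical number; cpuid only matters for
 * private interrupts (SGIs/PPIs).
 */
#define EXAMPLE_SPI_INTID      40

static int example_assert_line(struct kvm *kvm)
{
        /* Raise the virtual input line. */
        return kvm_vgic_inject_irq(kvm, 0, EXAMPLE_SPI_INTID, true, NULL);
}

static int example_deassert_line(struct kvm *kvm)
{
        /* Lower it again once the device has been serviced. */
        return kvm_vgic_inject_irq(kvm, 0, EXAMPLE_SPI_INTID, false, NULL);
}
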
+
+/* @irq->irq_lock must be held */
+static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
+                           unsigned int host_irq,
+                           bool (*get_input_level)(int vintid))
+{
+       struct irq_desc *desc;
+       struct irq_data *data;
+
+       /*
+        * Find the physical IRQ number corresponding to @host_irq
+        */
+       desc = irq_to_desc(host_irq);
+       if (!desc) {
+               kvm_err("%s: no interrupt descriptor\n", __func__);
+               return -EINVAL;
+       }
+       data = irq_desc_get_irq_data(desc);
+       while (data->parent_data)
+               data = data->parent_data;
+
+       irq->hw = true;
+       irq->host_irq = host_irq;
+       irq->hwintid = data->hwirq;
+       irq->get_input_level = get_input_level;
+       return 0;
+}
+
+/* @irq->irq_lock must be held */
+static inline void kvm_vgic_unmap_irq(struct vgic_irq *irq)
+{
+       irq->hw = false;
+       irq->hwintid = 0;
+       irq->get_input_level = NULL;
+}
+
+int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
+                         u32 vintid, bool (*get_input_level)(int vintid))
+{
+       struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+       unsigned long flags;
+       int ret;
+
+       BUG_ON(!irq);
+
+       raw_spin_lock_irqsave(&irq->irq_lock, flags);
+       ret = kvm_vgic_map_irq(vcpu, irq, host_irq, get_input_level);
+       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+       vgic_put_irq(vcpu->kvm, irq);
+
+       return ret;
+}
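
Mapping is how in-kernel users such as the arch timer or the PMU tie a host
interrupt to a virtual INTID so that its active state can be handled in
hardware. A hedged sketch of such a caller follows; example_get_line_level()
is a hypothetical callback for devices whose line level cannot be read back
from the host interrupt controller.

/* Hedged sketch of an in-kernel user forwarding a host IRQ to the guest. */
static bool example_get_line_level(int vintid)
{
        return false;   /* device-specific state would be consulted here */
}

static int example_map_device_irq(struct kvm_vcpu *vcpu,
                                  unsigned int host_irq, u32 vintid)
{
        return kvm_vgic_map_phys_irq(vcpu, host_irq, vintid,
                                     example_get_line_level);
}
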
+
+/**
+ * kvm_vgic_reset_mapped_irq - Reset a mapped IRQ
+ * @vcpu: The VCPU pointer
+ * @vintid: The INTID of the interrupt
+ *
+ * Reset the active and pending states of a mapped interrupt.  Kernel
+ * subsystems injecting mapped interrupts should reset their interrupt lines
+ * when we are doing a reset of the VM.
+ */
+void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid)
+{
+       struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+       unsigned long flags;
+
+       if (!irq->hw)
+               goto out;
+
+       raw_spin_lock_irqsave(&irq->irq_lock, flags);
+       irq->active = false;
+       irq->pending_latch = false;
+       irq->line_level = false;
+       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+out:
+       vgic_put_irq(vcpu->kvm, irq);
+}
+
+int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid)
+{
+       struct vgic_irq *irq;
+       unsigned long flags;
+
+       if (!vgic_initialized(vcpu->kvm))
+               return -EAGAIN;
+
+       irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+       BUG_ON(!irq);
+
+       raw_spin_lock_irqsave(&irq->irq_lock, flags);
+       kvm_vgic_unmap_irq(irq);
+       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+       vgic_put_irq(vcpu->kvm, irq);
+
+       return 0;
+}
+
+/**
+ * kvm_vgic_set_owner - Set the owner of an interrupt for a VM
+ *
+ * @vcpu:   Pointer to the VCPU (used for PPIs)
+ * @intid:  The virtual INTID identifying the interrupt (PPI or SPI)
+ * @owner:  Opaque pointer to the owner
+ *
+ * Returns 0 if intid is not already used by another in-kernel device and the
+ * owner is set, otherwise returns an error code.
+ */
+int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner)
+{
+       struct vgic_irq *irq;
+       unsigned long flags;
+       int ret = 0;
+
+       if (!vgic_initialized(vcpu->kvm))
+               return -EAGAIN;
+
+       /* SGIs and LPIs cannot be wired up to any device */
+       if (!irq_is_ppi(intid) && !vgic_valid_spi(vcpu->kvm, intid))
+               return -EINVAL;
+
+       irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+       raw_spin_lock_irqsave(&irq->irq_lock, flags);
+       if (irq->owner && irq->owner != owner)
+               ret = -EEXIST;
+       else
+               irq->owner = owner;
+       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+       return ret;
+}
+
+/**
+ * vgic_prune_ap_list - Remove non-relevant interrupts from the list
+ *
+ * @vcpu: The VCPU pointer
+ *
+ * Go over the list of "interesting" interrupts, and prune those that we
+ * won't have to consider in the near future.
+ */
+static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       struct vgic_irq *irq, *tmp;
+
+       DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
+
+retry:
+       raw_spin_lock(&vgic_cpu->ap_list_lock);
+
+       list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
+               struct kvm_vcpu *target_vcpu, *vcpuA, *vcpuB;
+               bool target_vcpu_needs_kick = false;
+
+               raw_spin_lock(&irq->irq_lock);
+
+               BUG_ON(vcpu != irq->vcpu);
+
+               target_vcpu = vgic_target_oracle(irq);
+
+               if (!target_vcpu) {
+                       /*
+                        * We don't need to process this interrupt any
+                        * further, move it off the list.
+                        */
+                       list_del(&irq->ap_list);
+                       irq->vcpu = NULL;
+                       raw_spin_unlock(&irq->irq_lock);
+
+                       /*
+                        * This vgic_put_irq call matches the
+                        * vgic_get_irq_kref in vgic_queue_irq_unlock,
+                        * where we added the LPI to the ap_list. As
+                        * we remove the irq from the list, we also
+                        * drop the refcount.
+                        */
+                       vgic_put_irq(vcpu->kvm, irq);
+                       continue;
+               }
+
+               if (target_vcpu == vcpu) {
+                       /* We're on the right CPU */
+                       raw_spin_unlock(&irq->irq_lock);
+                       continue;
+               }
+
+               /* This interrupt looks like it has to be migrated. */
+
+               raw_spin_unlock(&irq->irq_lock);
+               raw_spin_unlock(&vgic_cpu->ap_list_lock);
+
+               /*
+                * Ensure locking order by always locking the smallest
+                * ID first.
+                */
+               if (vcpu->vcpu_id < target_vcpu->vcpu_id) {
+                       vcpuA = vcpu;
+                       vcpuB = target_vcpu;
+               } else {
+                       vcpuA = target_vcpu;
+                       vcpuB = vcpu;
+               }
+
+               raw_spin_lock(&vcpuA->arch.vgic_cpu.ap_list_lock);
+               raw_spin_lock_nested(&vcpuB->arch.vgic_cpu.ap_list_lock,
+                                     SINGLE_DEPTH_NESTING);
+               raw_spin_lock(&irq->irq_lock);
+
+               /*
+                * If the affinity has been preserved, move the
+                * interrupt around. Otherwise, it means things have
+                * changed while the interrupt was unlocked, and we
+                * need to replay this.
+                *
+                * In all cases, we cannot trust the list not to have
+                * changed, so we restart from the beginning.
+                */
+               if (target_vcpu == vgic_target_oracle(irq)) {
+                       struct vgic_cpu *new_cpu = &target_vcpu->arch.vgic_cpu;
+
+                       list_del(&irq->ap_list);
+                       irq->vcpu = target_vcpu;
+                       list_add_tail(&irq->ap_list, &new_cpu->ap_list_head);
+                       target_vcpu_needs_kick = true;
+               }
+
+               raw_spin_unlock(&irq->irq_lock);
+               raw_spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock);
+               raw_spin_unlock(&vcpuA->arch.vgic_cpu.ap_list_lock);
+
+               if (target_vcpu_needs_kick) {
+                       kvm_make_request(KVM_REQ_IRQ_PENDING, target_vcpu);
+                       kvm_vcpu_kick(target_vcpu);
+               }
+
+               goto retry;
+       }
+
+       raw_spin_unlock(&vgic_cpu->ap_list_lock);
+}
+
+static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
+{
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_fold_lr_state(vcpu);
+       else
+               vgic_v3_fold_lr_state(vcpu);
+}
+
+/* Requires the irq_lock to be held. */
+static inline void vgic_populate_lr(struct kvm_vcpu *vcpu,
+                                   struct vgic_irq *irq, int lr)
+{
+       lockdep_assert_held(&irq->irq_lock);
+
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_populate_lr(vcpu, irq, lr);
+       else
+               vgic_v3_populate_lr(vcpu, irq, lr);
+}
+
+static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr)
+{
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_clear_lr(vcpu, lr);
+       else
+               vgic_v3_clear_lr(vcpu, lr);
+}
+
+static inline void vgic_set_underflow(struct kvm_vcpu *vcpu)
+{
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_set_underflow(vcpu);
+       else
+               vgic_v3_set_underflow(vcpu);
+}
+
+/* Requires the ap_list_lock to be held. */
+static int compute_ap_list_depth(struct kvm_vcpu *vcpu,
+                                bool *multi_sgi)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       struct vgic_irq *irq;
+       int count = 0;
+
+       *multi_sgi = false;
+
+       lockdep_assert_held(&vgic_cpu->ap_list_lock);
+
+       list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
+               int w;
+
+               raw_spin_lock(&irq->irq_lock);
+               /* GICv2 SGIs can count for more than one... */
+               w = vgic_irq_get_lr_count(irq);
+               raw_spin_unlock(&irq->irq_lock);
+
+               count += w;
+               *multi_sgi |= (w > 1);
+       }
+       return count;
+}
+
+/* Requires the VCPU's ap_list_lock to be held. */
+static void vgic_flush_lr_state(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       struct vgic_irq *irq;
+       int count;
+       bool multi_sgi;
+       u8 prio = 0xff;
+
+       lockdep_assert_held(&vgic_cpu->ap_list_lock);
+
+       count = compute_ap_list_depth(vcpu, &multi_sgi);
+       if (count > kvm_vgic_global_state.nr_lr || multi_sgi)
+               vgic_sort_ap_list(vcpu);
+
+       count = 0;
+
+       list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
+               raw_spin_lock(&irq->irq_lock);
+
+               /*
+                * If we have multi-SGIs in the pipeline, we need to
+                * guarantee that they are all seen before any IRQ of
+                * lower priority. In that case, we need to filter out
+                * these interrupts by exiting early. This is easy as
+                * the AP list has been sorted already.
+                */
+               if (multi_sgi && irq->priority > prio) {
+                       _raw_spin_unlock(&irq->irq_lock);
+                       break;
+               }
+
+               if (likely(vgic_target_oracle(irq) == vcpu)) {
+                       vgic_populate_lr(vcpu, irq, count++);
+
+                       if (irq->source)
+                               prio = irq->priority;
+               }
+
+               raw_spin_unlock(&irq->irq_lock);
+
+               if (count == kvm_vgic_global_state.nr_lr) {
+                       if (!list_is_last(&irq->ap_list,
+                                         &vgic_cpu->ap_list_head))
+                               vgic_set_underflow(vcpu);
+                       break;
+               }
+       }
+
+       vcpu->arch.vgic_cpu.used_lrs = count;
+
+       /* Nuke remaining LRs */
+       for ( ; count < kvm_vgic_global_state.nr_lr; count++)
+               vgic_clear_lr(vcpu, count);
+}
+
+static inline bool can_access_vgic_from_kernel(void)
+{
+       /*
+        * GICv2 can always be accessed from the kernel because it is
+        * memory-mapped, and VHE systems can access GICv3 EL2 system
+        * registers.
+        */
+       return !static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) || has_vhe();
+}
+
+static inline void vgic_save_state(struct kvm_vcpu *vcpu)
+{
+       if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+               vgic_v2_save_state(vcpu);
+       else
+               __vgic_v3_save_state(vcpu);
+}
+
+/* Sync back the hardware VGIC state into our emulation after a guest's run. */
+void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+       /* An empty ap_list_head implies used_lrs == 0 */
+       if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head))
+               return;
+
+       if (can_access_vgic_from_kernel())
+               vgic_save_state(vcpu);
+
+       if (vgic_cpu->used_lrs)
+               vgic_fold_lr_state(vcpu);
+       vgic_prune_ap_list(vcpu);
+}
+
+static inline void vgic_restore_state(struct kvm_vcpu *vcpu)
+{
+       if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+               vgic_v2_restore_state(vcpu);
+       else
+               __vgic_v3_restore_state(vcpu);
+}
+
+/* Flush our emulation state into the GIC hardware before entering the guest. */
+void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
+{
+       /*
+        * If there are no virtual interrupts active or pending for this
+        * VCPU, then there is no work to do and we can bail out without
+        * taking any lock.  There is a potential race with someone injecting
+        * interrupts to the VCPU, but it is a benign race as the VCPU will
+        * either observe the new interrupt before or after doing this check,
+        * and introducing additional synchronization mechanism doesn't change
+        * this.
+        *
+        * Note that we still need to go through the whole thing if anything
+        * can be directly injected (GICv4).
+        */
+       if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head) &&
+           !vgic_supports_direct_msis(vcpu->kvm))
+               return;
+
+       DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
+
+       if (!list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) {
+               raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
+               vgic_flush_lr_state(vcpu);
+               raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
+       }
+
+       if (can_access_vgic_from_kernel())
+               vgic_restore_state(vcpu);
+}
+
+void kvm_vgic_load(struct kvm_vcpu *vcpu)
+{
+       if (unlikely(!vgic_initialized(vcpu->kvm)))
+               return;
+
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_load(vcpu);
+       else
+               vgic_v3_load(vcpu);
+}
+
+void kvm_vgic_put(struct kvm_vcpu *vcpu)
+{
+       if (unlikely(!vgic_initialized(vcpu->kvm)))
+               return;
+
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_put(vcpu);
+       else
+               vgic_v3_put(vcpu);
+}
+
+void kvm_vgic_vmcr_sync(struct kvm_vcpu *vcpu)
+{
+       if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
+               return;
+
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_vmcr_sync(vcpu);
+       else
+               vgic_v3_vmcr_sync(vcpu);
+}
+
+int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       struct vgic_irq *irq;
+       bool pending = false;
+       unsigned long flags;
+       struct vgic_vmcr vmcr;
+
+       if (!vcpu->kvm->arch.vgic.enabled)
+               return false;
+
+       if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last)
+               return true;
+
+       vgic_get_vmcr(vcpu, &vmcr);
+
+       raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
+
+       list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
+               raw_spin_lock(&irq->irq_lock);
+               pending = irq_is_pending(irq) && irq->enabled &&
+                         !irq->active &&
+                         irq->priority < vmcr.pmr;
+               raw_spin_unlock(&irq->irq_lock);
+
+               if (pending)
+                       break;
+       }
+
+       raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
+
+       return pending;
+}
+
+void vgic_kick_vcpus(struct kvm *kvm)
+{
+       struct kvm_vcpu *vcpu;
+       int c;
+
+       /*
+        * We've injected an interrupt, time to find out who deserves
+        * a good kick...
+        */
+       kvm_for_each_vcpu(c, vcpu, kvm) {
+               if (kvm_vgic_vcpu_pending_irq(vcpu)) {
+                       kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+                       kvm_vcpu_kick(vcpu);
+               }
+       }
+}
+
+bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid)
+{
+       struct vgic_irq *irq;
+       bool map_is_active;
+       unsigned long flags;
+
+       if (!vgic_initialized(vcpu->kvm))
+               return false;
+
+       irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+       raw_spin_lock_irqsave(&irq->irq_lock, flags);
+       map_is_active = irq->hw && irq->active;
+       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+       vgic_put_irq(vcpu->kvm, irq);
+
+       return map_is_active;
+}
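
Taken together, these hooks bracket every guest run: load/put around vcpu
scheduling, flush before entry, sync after exit while interrupts are still
disabled. A simplified, hedged sketch of that ordering follows (the real run
loop lives in arch/arm64/kvm/arm.c and does considerably more).

/* Hedged sketch of how the vgic hooks bracket a single guest run. */
static void example_vcpu_run_once(struct kvm_vcpu *vcpu)
{
        kvm_vgic_load(vcpu);              /* on vcpu_load()              */

        local_irq_disable();
        kvm_vgic_flush_hwstate(vcpu);     /* emulation state -> LRs      */

        /* ... enter the guest here ... */

        kvm_vgic_sync_hwstate(vcpu);      /* LRs -> emulation state      */
        local_irq_enable();

        kvm_vgic_put(vcpu);               /* on vcpu_put()               */
}
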
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
new file mode 100644 (file)
index 0000000..769e480
--- /dev/null
@@ -0,0 +1,321 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ */
+#ifndef __KVM_ARM_VGIC_NEW_H__
+#define __KVM_ARM_VGIC_NEW_H__
+
+#include <linux/irqchip/arm-gic-common.h>
+
+#define PRODUCT_ID_KVM         0x4b    /* ASCII code K */
+#define IMPLEMENTER_ARM                0x43b
+
+#define VGIC_ADDR_UNDEF                (-1)
+#define IS_VGIC_ADDR_UNDEF(_x)  ((_x) == VGIC_ADDR_UNDEF)
+
+#define INTERRUPT_ID_BITS_SPIS 10
+#define INTERRUPT_ID_BITS_ITS  16
+#define VGIC_PRI_BITS          5
+
+#define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS)
+
+#define VGIC_AFFINITY_0_SHIFT 0
+#define VGIC_AFFINITY_0_MASK (0xffUL << VGIC_AFFINITY_0_SHIFT)
+#define VGIC_AFFINITY_1_SHIFT 8
+#define VGIC_AFFINITY_1_MASK (0xffUL << VGIC_AFFINITY_1_SHIFT)
+#define VGIC_AFFINITY_2_SHIFT 16
+#define VGIC_AFFINITY_2_MASK (0xffUL << VGIC_AFFINITY_2_SHIFT)
+#define VGIC_AFFINITY_3_SHIFT 24
+#define VGIC_AFFINITY_3_MASK (0xffUL << VGIC_AFFINITY_3_SHIFT)
+
+#define VGIC_AFFINITY_LEVEL(reg, level) \
+       ((((reg) & VGIC_AFFINITY_## level ##_MASK) \
+       >> VGIC_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))
+
+/*
+ * Userspace encodes the affinity differently from the MPIDR; the macro
+ * below converts the vgic userspace format to the MPIDR register format.
+ */
+#define VGIC_TO_MPIDR(val) (VGIC_AFFINITY_LEVEL(val, 0) | \
+                           VGIC_AFFINITY_LEVEL(val, 1) | \
+                           VGIC_AFFINITY_LEVEL(val, 2) | \
+                           VGIC_AFFINITY_LEVEL(val, 3))
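
The conversion moves each 8-bit affinity field from its packed byte position
in the userspace value to its architectural position in MPIDR_EL1
(Aff0/Aff1/Aff2 at bits 0/8/16, Aff3 at bits [39:32]). The standalone sketch
below spells out the same math; the shift values are an assumption based on
the ARMv8 MPIDR layout, since MPIDR_LEVEL_SHIFT() is defined elsewhere.

/* Standalone model of VGIC_TO_MPIDR(): unpack four byte-wide affinity
 * fields and place them at their (assumed) MPIDR_EL1 bit positions.
 */
#include <stdint.h>
#include <stdio.h>

static const int mpidr_shift[4] = { 0, 8, 16, 32 };

static uint64_t vgic_to_mpidr(uint32_t val)
{
        uint64_t mpidr = 0;
        int level;

        for (level = 0; level < 4; level++) {
                uint64_t aff = (val >> (8 * level)) & 0xff;

                mpidr |= aff << mpidr_shift[level];
        }
        return mpidr;
}

int main(void)
{
        /* Aff3.Aff2.Aff1.Aff0 = 01.02.03.04 -> 0x100020304. */
        printf("%#llx\n", (unsigned long long)vgic_to_mpidr(0x01020304));
        return 0;
}
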
+
+/*
+ * As per Documentation/virt/kvm/devices/arm-vgic-v3.txt,
+ * below macros are defined for CPUREG encoding.
+ */
+#define KVM_REG_ARM_VGIC_SYSREG_OP0_MASK   0x000000000000c000
+#define KVM_REG_ARM_VGIC_SYSREG_OP0_SHIFT  14
+#define KVM_REG_ARM_VGIC_SYSREG_OP1_MASK   0x0000000000003800
+#define KVM_REG_ARM_VGIC_SYSREG_OP1_SHIFT  11
+#define KVM_REG_ARM_VGIC_SYSREG_CRN_MASK   0x0000000000000780
+#define KVM_REG_ARM_VGIC_SYSREG_CRN_SHIFT  7
+#define KVM_REG_ARM_VGIC_SYSREG_CRM_MASK   0x0000000000000078
+#define KVM_REG_ARM_VGIC_SYSREG_CRM_SHIFT  3
+#define KVM_REG_ARM_VGIC_SYSREG_OP2_MASK   0x0000000000000007
+#define KVM_REG_ARM_VGIC_SYSREG_OP2_SHIFT  0
+
+#define KVM_DEV_ARM_VGIC_SYSREG_MASK (KVM_REG_ARM_VGIC_SYSREG_OP0_MASK | \
+                                     KVM_REG_ARM_VGIC_SYSREG_OP1_MASK | \
+                                     KVM_REG_ARM_VGIC_SYSREG_CRN_MASK | \
+                                     KVM_REG_ARM_VGIC_SYSREG_CRM_MASK | \
+                                     KVM_REG_ARM_VGIC_SYSREG_OP2_MASK)
+
+/*
+ * As per Documentation/virt/kvm/devices/arm-vgic-its.txt,
+ * below macros are defined for ITS table entry encoding.
+ */
+#define KVM_ITS_CTE_VALID_SHIFT                63
+#define KVM_ITS_CTE_VALID_MASK         BIT_ULL(63)
+#define KVM_ITS_CTE_RDBASE_SHIFT       16
+#define KVM_ITS_CTE_ICID_MASK          GENMASK_ULL(15, 0)
+#define KVM_ITS_ITE_NEXT_SHIFT         48
+#define KVM_ITS_ITE_PINTID_SHIFT       16
+#define KVM_ITS_ITE_PINTID_MASK                GENMASK_ULL(47, 16)
+#define KVM_ITS_ITE_ICID_MASK          GENMASK_ULL(15, 0)
+#define KVM_ITS_DTE_VALID_SHIFT                63
+#define KVM_ITS_DTE_VALID_MASK         BIT_ULL(63)
+#define KVM_ITS_DTE_NEXT_SHIFT         49
+#define KVM_ITS_DTE_NEXT_MASK          GENMASK_ULL(62, 49)
+#define KVM_ITS_DTE_ITTADDR_SHIFT      5
+#define KVM_ITS_DTE_ITTADDR_MASK       GENMASK_ULL(48, 5)
+#define KVM_ITS_DTE_SIZE_MASK          GENMASK_ULL(4, 0)
+#define KVM_ITS_L1E_VALID_MASK         BIT_ULL(63)
+/* we only support 64 kB translation table page size */
+#define KVM_ITS_L1E_ADDR_MASK          GENMASK_ULL(51, 16)
+
+#define KVM_VGIC_V3_RDIST_INDEX_MASK   GENMASK_ULL(11, 0)
+#define KVM_VGIC_V3_RDIST_FLAGS_MASK   GENMASK_ULL(15, 12)
+#define KVM_VGIC_V3_RDIST_FLAGS_SHIFT  12
+#define KVM_VGIC_V3_RDIST_BASE_MASK    GENMASK_ULL(51, 16)
+#define KVM_VGIC_V3_RDIST_COUNT_MASK   GENMASK_ULL(63, 52)
+#define KVM_VGIC_V3_RDIST_COUNT_SHIFT  52
+
+#ifdef CONFIG_DEBUG_SPINLOCK
+#define DEBUG_SPINLOCK_BUG_ON(p) BUG_ON(p)
+#else
+#define DEBUG_SPINLOCK_BUG_ON(p)
+#endif
+
+/* Requires the irq_lock to be held by the caller. */
+static inline bool irq_is_pending(struct vgic_irq *irq)
+{
+       if (irq->config == VGIC_CONFIG_EDGE)
+               return irq->pending_latch;
+       else
+               return irq->pending_latch || irq->line_level;
+}
+
+static inline bool vgic_irq_is_mapped_level(struct vgic_irq *irq)
+{
+       return irq->config == VGIC_CONFIG_LEVEL && irq->hw;
+}
+
+static inline int vgic_irq_get_lr_count(struct vgic_irq *irq)
+{
+       /* Account for the active state as an interrupt */
+       if (vgic_irq_is_sgi(irq->intid) && irq->source)
+               return hweight8(irq->source) + irq->active;
+
+       return irq_is_pending(irq) || irq->active;
+}
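
For GICv2 SGIs the source bitmap means one INTID may need several list
registers (one per requesting CPU), plus one if it is currently active;
anything else takes at most one LR. A standalone sketch of that weighting,
with __builtin_popcount() standing in for hweight8():

/* Standalone model of the LR weighting done by vgic_irq_get_lr_count(). */
#include <stdio.h>
#include <stdbool.h>

static int lr_count(bool is_sgi, unsigned char source, bool pending,
                    bool active)
{
        if (is_sgi && source)
                return __builtin_popcount(source) + active;

        return (pending || active) ? 1 : 0;
}

int main(void)
{
        /* SGI requested by CPUs 0, 2 and 3, and currently active: 4 LRs. */
        printf("%d\n", lr_count(true, 0x0d, true, true));
        return 0;
}
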
+
+static inline bool vgic_irq_is_multi_sgi(struct vgic_irq *irq)
+{
+       return vgic_irq_get_lr_count(irq) > 1;
+}
+
+/*
+ * This struct provides an intermediate representation of the fields contained
+ * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC
+ * state to userspace can generate either GICv2 or GICv3 CPU interface
+ * registers regardless of the hardware backed GIC used.
+ */
+struct vgic_vmcr {
+       u32     grpen0;
+       u32     grpen1;
+
+       u32     ackctl;
+       u32     fiqen;
+       u32     cbpr;
+       u32     eoim;
+
+       u32     abpr;
+       u32     bpr;
+       u32     pmr;  /* Priority mask field in the GICC_PMR and
+                      * ICC_PMR_EL1 priority field format */
+};
+
+struct vgic_reg_attr {
+       struct kvm_vcpu *vcpu;
+       gpa_t addr;
+};
+
+int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
+                      struct vgic_reg_attr *reg_attr);
+int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
+                      struct vgic_reg_attr *reg_attr);
+const struct vgic_register_region *
+vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
+                    gpa_t addr, int len);
+struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
+                             u32 intid);
+void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq);
+void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq);
+bool vgic_get_phys_line_level(struct vgic_irq *irq);
+void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending);
+void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active);
+bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
+                          unsigned long flags);
+void vgic_kick_vcpus(struct kvm *kvm);
+
+int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
+                     phys_addr_t addr, phys_addr_t alignment);
+
+void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu);
+void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
+void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr);
+void vgic_v2_set_underflow(struct kvm_vcpu *vcpu);
+void vgic_v2_set_npie(struct kvm_vcpu *vcpu);
+int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
+int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+                        int offset, u32 *val);
+int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+                         int offset, u32 *val);
+void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_v2_enable(struct kvm_vcpu *vcpu);
+int vgic_v2_probe(const struct gic_kvm_info *info);
+int vgic_v2_map_resources(struct kvm *kvm);
+int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
+                            enum vgic_type);
+
+void vgic_v2_init_lrs(void);
+void vgic_v2_load(struct kvm_vcpu *vcpu);
+void vgic_v2_put(struct kvm_vcpu *vcpu);
+void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu);
+
+void vgic_v2_save_state(struct kvm_vcpu *vcpu);
+void vgic_v2_restore_state(struct kvm_vcpu *vcpu);
+
+static inline void vgic_get_irq_kref(struct vgic_irq *irq)
+{
+       if (irq->intid < VGIC_MIN_LPI)
+               return;
+
+       kref_get(&irq->refcount);
+}
+
+void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
+void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
+void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr);
+void vgic_v3_set_underflow(struct kvm_vcpu *vcpu);
+void vgic_v3_set_npie(struct kvm_vcpu *vcpu);
+void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_v3_enable(struct kvm_vcpu *vcpu);
+int vgic_v3_probe(const struct gic_kvm_info *info);
+int vgic_v3_map_resources(struct kvm *kvm);
+int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq);
+int vgic_v3_save_pending_tables(struct kvm *kvm);
+int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count);
+int vgic_register_redist_iodev(struct kvm_vcpu *vcpu);
+bool vgic_v3_check_base(struct kvm *kvm);
+
+void vgic_v3_load(struct kvm_vcpu *vcpu);
+void vgic_v3_put(struct kvm_vcpu *vcpu);
+void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu);
+
+bool vgic_has_its(struct kvm *kvm);
+int kvm_vgic_register_its_device(void);
+void vgic_enable_lpis(struct kvm_vcpu *vcpu);
+void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu);
+int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi);
+int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
+int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+                        int offset, u32 *val);
+int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+                        int offset, u32 *val);
+int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+                        u64 id, u64 *val);
+int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id,
+                               u64 *reg);
+int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+                                   u32 intid, u64 *val);
+int kvm_register_vgic_device(unsigned long type);
+void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+int vgic_lazy_init(struct kvm *kvm);
+int vgic_init(struct kvm *kvm);
+
+void vgic_debug_init(struct kvm *kvm);
+void vgic_debug_destroy(struct kvm *kvm);
+
+bool lock_all_vcpus(struct kvm *kvm);
+void unlock_all_vcpus(struct kvm *kvm);
+
+static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *cpu_if = &vcpu->arch.vgic_cpu;
+
+       /*
+        * num_pri_bits are initialized with HW supported values.
+        * We can safely rely on num_pri_bits even if the VM has not
+        * restored ICC_CTLR_EL1 before restoring the APnR registers.
+        */
+       switch (cpu_if->num_pri_bits) {
+       case 7: return 3;
+       case 6: return 1;
+       default: return 0;
+       }
+}
+
+static inline bool
+vgic_v3_redist_region_full(struct vgic_redist_region *region)
+{
+       if (!region->count)
+               return false;
+
+       return (region->free_index >= region->count);
+}
+
+struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rdregs);
+
+static inline size_t
+vgic_v3_rd_region_size(struct kvm *kvm, struct vgic_redist_region *rdreg)
+{
+       if (!rdreg->count)
+               return atomic_read(&kvm->online_vcpus) * KVM_VGIC_V3_REDIST_SIZE;
+       else
+               return rdreg->count * KVM_VGIC_V3_REDIST_SIZE;
+}
+
+struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm,
+                                                          u32 index);
+
+bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size);
+
+static inline bool vgic_dist_overlap(struct kvm *kvm, gpa_t base, size_t size)
+{
+       struct vgic_dist *d = &kvm->arch.vgic;
+
+       return (base + size > d->vgic_dist_base) &&
+               (base < d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE);
+}
+
+int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr);
+int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
+                        u32 devid, u32 eventid, struct vgic_irq **irq);
+struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi);
+int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi);
+void vgic_lpi_translation_cache_init(struct kvm *kvm);
+void vgic_lpi_translation_cache_destroy(struct kvm *kvm);
+void vgic_its_invalidate_cache(struct kvm *kvm);
+
+bool vgic_supports_direct_msis(struct kvm *kvm);
+int vgic_v4_init(struct kvm *kvm);
+void vgic_v4_teardown(struct kvm *kvm);
+void vgic_v4_configure_vsgis(struct kvm *kvm);
+
+#endif
diff --git a/virt/kvm/arm/aarch32.c b/virt/kvm/arm/aarch32.c
deleted file mode 100644 (file)
index 0a356aa..0000000
+++ /dev/null
@@ -1,204 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * (not much of an) Emulation layer for 32bit guests.
- *
- * Copyright (C) 2012,2013 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * based on arch/arm/kvm/emulate.c
- * Copyright (C) 2012 - Virtual Open Systems and Columbia University
- * Author: Christoffer Dall <c.dall@virtualopensystems.com>
- */
-
-#include <linux/bits.h>
-#include <linux/kvm_host.h>
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_hyp.h>
-
-#define DFSR_FSC_EXTABT_LPAE   0x10
-#define DFSR_FSC_EXTABT_nLPAE  0x08
-#define DFSR_LPAE              BIT(9)
-
-/*
- * Table taken from ARMv8 ARM DDI0487B-B, table G1-10.
- */
-static const u8 return_offsets[8][2] = {
-       [0] = { 0, 0 },         /* Reset, unused */
-       [1] = { 4, 2 },         /* Undefined */
-       [2] = { 0, 0 },         /* SVC, unused */
-       [3] = { 4, 4 },         /* Prefetch abort */
-       [4] = { 8, 8 },         /* Data abort */
-       [5] = { 0, 0 },         /* HVC, unused */
-       [6] = { 4, 4 },         /* IRQ, unused */
-       [7] = { 4, 4 },         /* FIQ, unused */
-};
-
-/*
- * When an exception is taken, most CPSR fields are left unchanged in the
- * handler. However, some are explicitly overridden (e.g. M[4:0]).
- *
- * The SPSR/SPSR_ELx layouts differ, and the below is intended to work with
- * either format. Note: SPSR.J bit doesn't exist in SPSR_ELx, but this bit was
- * obsoleted by the ARMv7 virtualization extensions and is RES0.
- *
- * For the SPSR layout seen from AArch32, see:
- * - ARM DDI 0406C.d, page B1-1148
- * - ARM DDI 0487E.a, page G8-6264
- *
- * For the SPSR_ELx layout for AArch32 seen from AArch64, see:
- * - ARM DDI 0487E.a, page C5-426
- *
- * Here we manipulate the fields in order of the AArch32 SPSR_ELx layout, from
- * MSB to LSB.
- */
-static unsigned long get_except32_cpsr(struct kvm_vcpu *vcpu, u32 mode)
-{
-       u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
-       unsigned long old, new;
-
-       old = *vcpu_cpsr(vcpu);
-       new = 0;
-
-       new |= (old & PSR_AA32_N_BIT);
-       new |= (old & PSR_AA32_Z_BIT);
-       new |= (old & PSR_AA32_C_BIT);
-       new |= (old & PSR_AA32_V_BIT);
-       new |= (old & PSR_AA32_Q_BIT);
-
-       // CPSR.IT[7:0] are set to zero upon any exception
-       // See ARM DDI 0487E.a, section G1.12.3
-       // See ARM DDI 0406C.d, section B1.8.3
-
-       new |= (old & PSR_AA32_DIT_BIT);
-
-       // CPSR.SSBS is set to SCTLR.DSSBS upon any exception
-       // See ARM DDI 0487E.a, page G8-6244
-       if (sctlr & BIT(31))
-               new |= PSR_AA32_SSBS_BIT;
-
-       // CPSR.PAN is unchanged unless SCTLR.SPAN == 0b0
-       // SCTLR.SPAN is RES1 when ARMv8.1-PAN is not implemented
-       // See ARM DDI 0487E.a, page G8-6246
-       new |= (old & PSR_AA32_PAN_BIT);
-       if (!(sctlr & BIT(23)))
-               new |= PSR_AA32_PAN_BIT;
-
-       // SS does not exist in AArch32, so ignore
-
-       // CPSR.IL is set to zero upon any exception
-       // See ARM DDI 0487E.a, page G1-5527
-
-       new |= (old & PSR_AA32_GE_MASK);
-
-       // CPSR.IT[7:0] are set to zero upon any exception
-       // See prior comment above
-
-       // CPSR.E is set to SCTLR.EE upon any exception
-       // See ARM DDI 0487E.a, page G8-6245
-       // See ARM DDI 0406C.d, page B4-1701
-       if (sctlr & BIT(25))
-               new |= PSR_AA32_E_BIT;
-
-       // CPSR.A is unchanged upon an exception to Undefined, Supervisor
-       // CPSR.A is set upon an exception to other modes
-       // See ARM DDI 0487E.a, pages G1-5515 to G1-5516
-       // See ARM DDI 0406C.d, page B1-1182
-       new |= (old & PSR_AA32_A_BIT);
-       if (mode != PSR_AA32_MODE_UND && mode != PSR_AA32_MODE_SVC)
-               new |= PSR_AA32_A_BIT;
-
-       // CPSR.I is set upon any exception
-       // See ARM DDI 0487E.a, pages G1-5515 to G1-5516
-       // See ARM DDI 0406C.d, page B1-1182
-       new |= PSR_AA32_I_BIT;
-
-       // CPSR.F is set upon an exception to FIQ
-       // CPSR.F is unchanged upon an exception to other modes
-       // See ARM DDI 0487E.a, pages G1-5515 to G1-5516
-       // See ARM DDI 0406C.d, page B1-1182
-       new |= (old & PSR_AA32_F_BIT);
-       if (mode == PSR_AA32_MODE_FIQ)
-               new |= PSR_AA32_F_BIT;
-
-       // CPSR.T is set to SCTLR.TE upon any exception
-       // See ARM DDI 0487E.a, page G8-5514
-       // See ARM DDI 0406C.d, page B1-1181
-       if (sctlr & BIT(30))
-               new |= PSR_AA32_T_BIT;
-
-       new |= mode;
-
-       return new;
-}
-
-static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset)
-{
-       unsigned long spsr = *vcpu_cpsr(vcpu);
-       bool is_thumb = (spsr & PSR_AA32_T_BIT);
-       u32 return_offset = return_offsets[vect_offset >> 2][is_thumb];
-       u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
-
-       *vcpu_cpsr(vcpu) = get_except32_cpsr(vcpu, mode);
-
-       /* Note: These now point to the banked copies */
-       vcpu_write_spsr(vcpu, host_spsr_to_spsr32(spsr));
-       *vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset;
-
-       /* Branch to exception vector */
-       if (sctlr & (1 << 13))
-               vect_offset += 0xffff0000;
-       else /* always have security exceptions */
-               vect_offset += vcpu_cp15(vcpu, c12_VBAR);
-
-       *vcpu_pc(vcpu) = vect_offset;
-}
-
-void kvm_inject_undef32(struct kvm_vcpu *vcpu)
-{
-       prepare_fault32(vcpu, PSR_AA32_MODE_UND, 4);
-}
-
-/*
- * Modelled after TakeDataAbortException() and TakePrefetchAbortException
- * pseudocode.
- */
-static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt,
-                        unsigned long addr)
-{
-       u32 vect_offset;
-       u32 *far, *fsr;
-       bool is_lpae;
-
-       if (is_pabt) {
-               vect_offset = 12;
-               far = &vcpu_cp15(vcpu, c6_IFAR);
-               fsr = &vcpu_cp15(vcpu, c5_IFSR);
-       } else { /* !iabt */
-               vect_offset = 16;
-               far = &vcpu_cp15(vcpu, c6_DFAR);
-               fsr = &vcpu_cp15(vcpu, c5_DFSR);
-       }
-
-       prepare_fault32(vcpu, PSR_AA32_MODE_ABT, vect_offset);
-
-       *far = addr;
-
-       /* Give the guest an IMPLEMENTATION DEFINED exception */
-       is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31);
-       if (is_lpae) {
-               *fsr = DFSR_LPAE | DFSR_FSC_EXTABT_LPAE;
-       } else {
-               /* no need to shuffle FS[4] into DFSR[10] as its 0 */
-               *fsr = DFSR_FSC_EXTABT_nLPAE;
-       }
-}
-
-void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr)
-{
-       inject_abt32(vcpu, false, addr);
-}
-
-void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr)
-{
-       inject_abt32(vcpu, true, addr);
-}
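
The FSR written by inject_abt32() depends on whether the guest runs with LPAE
page tables (TTBCR.EAE set): the long-descriptor format uses FS code 0x10 with
DFSR.LPAE set, the short-descriptor format uses FS code 0x08. A standalone
sketch of the two resulting values, reusing the constants defined at the top
of this file:

/* Standalone sketch of the DFSR values written by inject_abt32(). */
#include <stdint.h>
#include <stdio.h>

#define DFSR_FSC_EXTABT_LPAE    0x10
#define DFSR_FSC_EXTABT_nLPAE   0x08
#define DFSR_LPAE               (1U << 9)

static uint32_t abort_fsr(int guest_uses_lpae)
{
        if (guest_uses_lpae)
                return DFSR_LPAE | DFSR_FSC_EXTABT_LPAE;   /* 0x210 */

        /* Short-descriptor format; FS[4] is 0 so no extra shuffling. */
        return DFSR_FSC_EXTABT_nLPAE;                      /* 0x008 */
}

int main(void)
{
        printf("LPAE: %#x, short-descriptor: %#x\n",
               abort_fsr(1), abort_fsr(0));
        return 0;
}
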
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
deleted file mode 100644 (file)
index 93bd59b..0000000
+++ /dev/null
@@ -1,1180 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2012 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/uaccess.h>
-
-#include <clocksource/arm_arch_timer.h>
-#include <asm/arch_timer.h>
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_hyp.h>
-
-#include <kvm/arm_vgic.h>
-#include <kvm/arm_arch_timer.h>
-
-#include "trace.h"
-
-static struct timecounter *timecounter;
-static unsigned int host_vtimer_irq;
-static unsigned int host_ptimer_irq;
-static u32 host_vtimer_irq_flags;
-static u32 host_ptimer_irq_flags;
-
-static DEFINE_STATIC_KEY_FALSE(has_gic_active_state);
-
-static const struct kvm_irq_level default_ptimer_irq = {
-       .irq    = 30,
-       .level  = 1,
-};
-
-static const struct kvm_irq_level default_vtimer_irq = {
-       .irq    = 27,
-       .level  = 1,
-};
-
-static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx);
-static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
-                                struct arch_timer_context *timer_ctx);
-static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx);
-static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
-                               struct arch_timer_context *timer,
-                               enum kvm_arch_timer_regs treg,
-                               u64 val);
-static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
-                             struct arch_timer_context *timer,
-                             enum kvm_arch_timer_regs treg);
-
-u64 kvm_phys_timer_read(void)
-{
-       return timecounter->cc->read(timecounter->cc);
-}
-
-static void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map)
-{
-       if (has_vhe()) {
-               map->direct_vtimer = vcpu_vtimer(vcpu);
-               map->direct_ptimer = vcpu_ptimer(vcpu);
-               map->emul_ptimer = NULL;
-       } else {
-               map->direct_vtimer = vcpu_vtimer(vcpu);
-               map->direct_ptimer = NULL;
-               map->emul_ptimer = vcpu_ptimer(vcpu);
-       }
-
-       trace_kvm_get_timer_map(vcpu->vcpu_id, map);
-}
-
-static inline bool userspace_irqchip(struct kvm *kvm)
-{
-       return static_branch_unlikely(&userspace_irqchip_in_use) &&
-               unlikely(!irqchip_in_kernel(kvm));
-}
-
-static void soft_timer_start(struct hrtimer *hrt, u64 ns)
-{
-       hrtimer_start(hrt, ktime_add_ns(ktime_get(), ns),
-                     HRTIMER_MODE_ABS_HARD);
-}
-
-static void soft_timer_cancel(struct hrtimer *hrt)
-{
-       hrtimer_cancel(hrt);
-}
-
-static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
-{
-       struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
-       struct arch_timer_context *ctx;
-       struct timer_map map;
-
-       /*
-        * We may see a timer interrupt after vcpu_put() has been called which
-        * sets the CPU's vcpu pointer to NULL, because even though the timer
-        * has been disabled in timer_save_state(), the hardware interrupt
-        * signal may not have been retired from the interrupt controller yet.
-        */
-       if (!vcpu)
-               return IRQ_HANDLED;
-
-       get_timer_map(vcpu, &map);
-
-       if (irq == host_vtimer_irq)
-               ctx = map.direct_vtimer;
-       else
-               ctx = map.direct_ptimer;
-
-       if (kvm_timer_should_fire(ctx))
-               kvm_timer_update_irq(vcpu, true, ctx);
-
-       if (userspace_irqchip(vcpu->kvm) &&
-           !static_branch_unlikely(&has_gic_active_state))
-               disable_percpu_irq(host_vtimer_irq);
-
-       return IRQ_HANDLED;
-}
-
-static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx)
-{
-       u64 cval, now;
-
-       cval = timer_ctx->cnt_cval;
-       now = kvm_phys_timer_read() - timer_ctx->cntvoff;
-
-       if (now < cval) {
-               u64 ns;
-
-               ns = cyclecounter_cyc2ns(timecounter->cc,
-                                        cval - now,
-                                        timecounter->mask,
-                                        &timecounter->frac);
-               return ns;
-       }
-
-       return 0;
-}
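
For reference, kvm_timer_compute_delta() above relies on cyclecounter_cyc2ns() to turn a cycle delta into nanoseconds, which is essentially a fixed-point multiply-and-shift. The standalone sketch below uses illustrative names and numbers (an assumed 50 MHz counter, i.e. 20 ns per cycle) and ignores the fractional remainder the kernel's timecounter tracks.

#include <stdint.h>
#include <stdio.h>

/* fixed-point cycles -> ns, without the fractional carry the kernel keeps */
static uint64_t cycles_to_ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
        return (cycles * mult) >> shift;
}

int main(void)
{
        /* assumed 50 MHz arch counter: 20 ns per cycle, so mult = 20, shift = 0 */
        uint64_t delta_cycles = 50000000;       /* cval - now */

        printf("%llu ns\n",
               (unsigned long long)cycles_to_ns(delta_cycles, 20, 0));
        return 0;       /* prints 1000000000 ns, i.e. one second */
}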
-
-static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx)
-{
-       WARN_ON(timer_ctx && timer_ctx->loaded);
-       return timer_ctx &&
-              !(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) &&
-               (timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_ENABLE);
-}
-
-/*
- * Returns the earliest expiration time in ns among guest timers.
- * Note that it will return 0 if none of the timers can fire.
- */
-static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu)
-{
-       u64 min_delta = ULLONG_MAX;
-       int i;
-
-       for (i = 0; i < NR_KVM_TIMERS; i++) {
-               struct arch_timer_context *ctx = &vcpu->arch.timer_cpu.timers[i];
-
-               WARN(ctx->loaded, "timer %d loaded\n", i);
-               if (kvm_timer_irq_can_fire(ctx))
-                       min_delta = min(min_delta, kvm_timer_compute_delta(ctx));
-       }
-
-       /* If none of the timers can fire, return 0 */
-       if (min_delta == ULLONG_MAX)
-               return 0;
-
-       return min_delta;
-}
-
-static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt)
-{
-       struct arch_timer_cpu *timer;
-       struct kvm_vcpu *vcpu;
-       u64 ns;
-
-       timer = container_of(hrt, struct arch_timer_cpu, bg_timer);
-       vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu);
-
-       /*
-        * Check that the timer has really expired from the guest's
-        * PoV (NTP on the host may have forced it to expire
-        * early). If we should have slept longer, restart it.
-        */
-       ns = kvm_timer_earliest_exp(vcpu);
-       if (unlikely(ns)) {
-               hrtimer_forward_now(hrt, ns_to_ktime(ns));
-               return HRTIMER_RESTART;
-       }
-
-       kvm_vcpu_wake_up(vcpu);
-       return HRTIMER_NORESTART;
-}
-
-static enum hrtimer_restart kvm_hrtimer_expire(struct hrtimer *hrt)
-{
-       struct arch_timer_context *ctx;
-       struct kvm_vcpu *vcpu;
-       u64 ns;
-
-       ctx = container_of(hrt, struct arch_timer_context, hrtimer);
-       vcpu = ctx->vcpu;
-
-       trace_kvm_timer_hrtimer_expire(ctx);
-
-       /*
-        * Check that the timer has really expired from the guest's
-        * PoV (NTP on the host may have forced it to expire
-        * early). If not ready, schedule for a later time.
-        */
-       ns = kvm_timer_compute_delta(ctx);
-       if (unlikely(ns)) {
-               hrtimer_forward_now(hrt, ns_to_ktime(ns));
-               return HRTIMER_RESTART;
-       }
-
-       kvm_timer_update_irq(vcpu, true, ctx);
-       return HRTIMER_NORESTART;
-}
-
-static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx)
-{
-       enum kvm_arch_timers index;
-       u64 cval, now;
-
-       if (!timer_ctx)
-               return false;
-
-       index = arch_timer_ctx_index(timer_ctx);
-
-       if (timer_ctx->loaded) {
-               u32 cnt_ctl = 0;
-
-               switch (index) {
-               case TIMER_VTIMER:
-                       cnt_ctl = read_sysreg_el0(SYS_CNTV_CTL);
-                       break;
-               case TIMER_PTIMER:
-                       cnt_ctl = read_sysreg_el0(SYS_CNTP_CTL);
-                       break;
-               case NR_KVM_TIMERS:
-                       /* GCC is braindead */
-                       cnt_ctl = 0;
-                       break;
-               }
-
-               return  (cnt_ctl & ARCH_TIMER_CTRL_ENABLE) &&
-                       (cnt_ctl & ARCH_TIMER_CTRL_IT_STAT) &&
-                      !(cnt_ctl & ARCH_TIMER_CTRL_IT_MASK);
-       }
-
-       if (!kvm_timer_irq_can_fire(timer_ctx))
-               return false;
-
-       cval = timer_ctx->cnt_cval;
-       now = kvm_phys_timer_read() - timer_ctx->cntvoff;
-
-       return cval <= now;
-}
-
-bool kvm_timer_is_pending(struct kvm_vcpu *vcpu)
-{
-       struct timer_map map;
-
-       get_timer_map(vcpu, &map);
-
-       return kvm_timer_should_fire(map.direct_vtimer) ||
-              kvm_timer_should_fire(map.direct_ptimer) ||
-              kvm_timer_should_fire(map.emul_ptimer);
-}
-
-/*
- * Reflect the timer output level into the kvm_run structure
- */
-void kvm_timer_update_run(struct kvm_vcpu *vcpu)
-{
-       struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-       struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-       struct kvm_sync_regs *regs = &vcpu->run->s.regs;
-
-       /* Populate the device bitmap with the timer states */
-       regs->device_irq_level &= ~(KVM_ARM_DEV_EL1_VTIMER |
-                                   KVM_ARM_DEV_EL1_PTIMER);
-       if (kvm_timer_should_fire(vtimer))
-               regs->device_irq_level |= KVM_ARM_DEV_EL1_VTIMER;
-       if (kvm_timer_should_fire(ptimer))
-               regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER;
-}
-
-static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
-                                struct arch_timer_context *timer_ctx)
-{
-       int ret;
-
-       timer_ctx->irq.level = new_level;
-       trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq,
-                                  timer_ctx->irq.level);
-
-       if (!userspace_irqchip(vcpu->kvm)) {
-               ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
-                                         timer_ctx->irq.irq,
-                                         timer_ctx->irq.level,
-                                         timer_ctx);
-               WARN_ON(ret);
-       }
-}
-
-/* Only called for a fully emulated timer */
-static void timer_emulate(struct arch_timer_context *ctx)
-{
-       bool should_fire = kvm_timer_should_fire(ctx);
-
-       trace_kvm_timer_emulate(ctx, should_fire);
-
-       if (should_fire != ctx->irq.level) {
-               kvm_timer_update_irq(ctx->vcpu, should_fire, ctx);
-               return;
-       }
-
-       /*
-        * If the timer can fire now, we don't need to have a soft timer
-        * scheduled for the future.  If the timer cannot fire at all,
-        * then we also don't need a soft timer.
-        */
-       if (!kvm_timer_irq_can_fire(ctx)) {
-               soft_timer_cancel(&ctx->hrtimer);
-               return;
-       }
-
-       soft_timer_start(&ctx->hrtimer, kvm_timer_compute_delta(ctx));
-}
-
-static void timer_save_state(struct arch_timer_context *ctx)
-{
-       struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu);
-       enum kvm_arch_timers index = arch_timer_ctx_index(ctx);
-       unsigned long flags;
-
-       if (!timer->enabled)
-               return;
-
-       local_irq_save(flags);
-
-       if (!ctx->loaded)
-               goto out;
-
-       switch (index) {
-       case TIMER_VTIMER:
-               ctx->cnt_ctl = read_sysreg_el0(SYS_CNTV_CTL);
-               ctx->cnt_cval = read_sysreg_el0(SYS_CNTV_CVAL);
-
-               /* Disable the timer */
-               write_sysreg_el0(0, SYS_CNTV_CTL);
-               isb();
-
-               break;
-       case TIMER_PTIMER:
-               ctx->cnt_ctl = read_sysreg_el0(SYS_CNTP_CTL);
-               ctx->cnt_cval = read_sysreg_el0(SYS_CNTP_CVAL);
-
-               /* Disable the timer */
-               write_sysreg_el0(0, SYS_CNTP_CTL);
-               isb();
-
-               break;
-       case NR_KVM_TIMERS:
-               BUG();
-       }
-
-       trace_kvm_timer_save_state(ctx);
-
-       ctx->loaded = false;
-out:
-       local_irq_restore(flags);
-}
-
-/*
- * Schedule the background timer before calling kvm_vcpu_block, so that this
- * thread is removed from its waitqueue and made runnable when there's a timer
- * interrupt to handle.
- */
-static void kvm_timer_blocking(struct kvm_vcpu *vcpu)
-{
-       struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-       struct timer_map map;
-
-       get_timer_map(vcpu, &map);
-
-       /*
-        * If no timers are capable of raising interrupts (disabled or
-        * masked), then there's no more work for us to do.
-        */
-       if (!kvm_timer_irq_can_fire(map.direct_vtimer) &&
-           !kvm_timer_irq_can_fire(map.direct_ptimer) &&
-           !kvm_timer_irq_can_fire(map.emul_ptimer))
-               return;
-
-       /*
-        * At least one guest timer will expire. Schedule a background timer.
-        * Set the earliest expiration time among the guest timers.
-        */
-       soft_timer_start(&timer->bg_timer, kvm_timer_earliest_exp(vcpu));
-}
-
-static void kvm_timer_unblocking(struct kvm_vcpu *vcpu)
-{
-       struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-
-       soft_timer_cancel(&timer->bg_timer);
-}
-
-static void timer_restore_state(struct arch_timer_context *ctx)
-{
-       struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu);
-       enum kvm_arch_timers index = arch_timer_ctx_index(ctx);
-       unsigned long flags;
-
-       if (!timer->enabled)
-               return;
-
-       local_irq_save(flags);
-
-       if (ctx->loaded)
-               goto out;
-
-       switch (index) {
-       case TIMER_VTIMER:
-               write_sysreg_el0(ctx->cnt_cval, SYS_CNTV_CVAL);
-               isb();
-               write_sysreg_el0(ctx->cnt_ctl, SYS_CNTV_CTL);
-               break;
-       case TIMER_PTIMER:
-               write_sysreg_el0(ctx->cnt_cval, SYS_CNTP_CVAL);
-               isb();
-               write_sysreg_el0(ctx->cnt_ctl, SYS_CNTP_CTL);
-               break;
-       case NR_KVM_TIMERS:
-               BUG();
-       }
-
-       trace_kvm_timer_restore_state(ctx);
-
-       ctx->loaded = true;
-out:
-       local_irq_restore(flags);
-}
-
-static void set_cntvoff(u64 cntvoff)
-{
-       u32 low = lower_32_bits(cntvoff);
-       u32 high = upper_32_bits(cntvoff);
-
-       /*
-        * Since kvm_call_hyp doesn't fully support the ARM PCS, especially on
-        * 32-bit systems, but rather passes arguments register by register,
-        * shifted one place (the function address goes in r0/x0), we cannot
-        * simply pass a 64-bit value as an argument; we have to split it into
-        * two 32-bit halves.
-        */
-       kvm_call_hyp(__kvm_timer_set_cntvoff, low, high);
-}
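
The comment above explains why CNTVOFF has to be handed to EL2 as two 32-bit halves. A minimal sketch of the split and of the recombination performed on the hyp side before the single 64-bit write to CNTVOFF_EL2 (standalone C, illustrative helper names):

#include <stdint.h>

static void split_cntvoff(uint64_t cntvoff, uint32_t *low, uint32_t *high)
{
        *low  = (uint32_t)(cntvoff & 0xffffffffULL);    /* lower_32_bits() */
        *high = (uint32_t)(cntvoff >> 32);              /* upper_32_bits() */
}

static uint64_t join_cntvoff(uint32_t low, uint32_t high)
{
        /* recombined before writing the 64-bit CNTVOFF_EL2 value */
        return ((uint64_t)high << 32) | low;
}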
-
-static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, bool active)
-{
-       int r;
-       r = irq_set_irqchip_state(ctx->host_timer_irq, IRQCHIP_STATE_ACTIVE, active);
-       WARN_ON(r);
-}
-
-static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx)
-{
-       struct kvm_vcpu *vcpu = ctx->vcpu;
-       bool phys_active = false;
-
-       /*
-        * Update the timer output so that it is likely to match the
-        * state we're about to restore. If the timer expires between
-        * this point and the register restoration, we'll take the
-        * interrupt anyway.
-        */
-       kvm_timer_update_irq(ctx->vcpu, kvm_timer_should_fire(ctx), ctx);
-
-       if (irqchip_in_kernel(vcpu->kvm))
-               phys_active = kvm_vgic_map_is_active(vcpu, ctx->irq.irq);
-
-       phys_active |= ctx->irq.level;
-
-       set_timer_irq_phys_active(ctx, phys_active);
-}
-
-static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu)
-{
-       struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-
-       /*
-        * Update the timer output so that it is likely to match the
-        * state we're about to restore. If the timer expires between
-        * this point and the register restoration, we'll take the
-        * interrupt anyway.
-        */
-       kvm_timer_update_irq(vcpu, kvm_timer_should_fire(vtimer), vtimer);
-
-       /*
-        * When using a userspace irqchip with the architected timers and a
-        * host interrupt controller that doesn't support an active state, we
-        * must still prevent continuously exiting from the guest, and
-        * therefore mask the physical interrupt by disabling it on the host
-        * interrupt controller when the virtual level is high, such that the
-        * guest can make forward progress.  Once we detect the output level
-        * being de-asserted, we unmask the interrupt again so that we exit
-        * from the guest when the timer fires.
-        */
-       if (vtimer->irq.level)
-               disable_percpu_irq(host_vtimer_irq);
-       else
-               enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
-}
-
-void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
-{
-       struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-       struct timer_map map;
-
-       if (unlikely(!timer->enabled))
-               return;
-
-       get_timer_map(vcpu, &map);
-
-       if (static_branch_likely(&has_gic_active_state)) {
-               kvm_timer_vcpu_load_gic(map.direct_vtimer);
-               if (map.direct_ptimer)
-                       kvm_timer_vcpu_load_gic(map.direct_ptimer);
-       } else {
-               kvm_timer_vcpu_load_nogic(vcpu);
-       }
-
-       set_cntvoff(map.direct_vtimer->cntvoff);
-
-       kvm_timer_unblocking(vcpu);
-
-       timer_restore_state(map.direct_vtimer);
-       if (map.direct_ptimer)
-               timer_restore_state(map.direct_ptimer);
-
-       if (map.emul_ptimer)
-               timer_emulate(map.emul_ptimer);
-}
-
-bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu)
-{
-       struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-       struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-       struct kvm_sync_regs *sregs = &vcpu->run->s.regs;
-       bool vlevel, plevel;
-
-       if (likely(irqchip_in_kernel(vcpu->kvm)))
-               return false;
-
-       vlevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_VTIMER;
-       plevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_PTIMER;
-
-       return kvm_timer_should_fire(vtimer) != vlevel ||
-              kvm_timer_should_fire(ptimer) != plevel;
-}
-
-void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
-{
-       struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-       struct timer_map map;
-
-       if (unlikely(!timer->enabled))
-               return;
-
-       get_timer_map(vcpu, &map);
-
-       timer_save_state(map.direct_vtimer);
-       if (map.direct_ptimer)
-               timer_save_state(map.direct_ptimer);
-
-       /*
-        * Cancel soft timer emulation, because the only case where we
-        * need it after a vcpu_put is in the context of a sleeping VCPU, and
-        * in that case we already factor in the deadline for the physical
-        * timer when scheduling the bg_timer.
-        *
-        * In any case, we re-schedule the hrtimer for the physical timer when
-        * coming back to the VCPU thread in kvm_timer_vcpu_load().
-        */
-       if (map.emul_ptimer)
-               soft_timer_cancel(&map.emul_ptimer->hrtimer);
-
-       if (swait_active(kvm_arch_vcpu_wq(vcpu)))
-               kvm_timer_blocking(vcpu);
-
-       /*
-        * The kernel may decide to run userspace after calling vcpu_put, so
-        * we reset cntvoff to 0 to ensure a consistent read between user
-        * accesses to the virtual counter and kernel accesses to the physical
-        * counter in the non-VHE case. For VHE, the virtual counter uses a
-        * fixed virtual offset of zero, so there is no need to zero the
-        * CNTVOFF_EL2 register.
-        */
-       set_cntvoff(0);
-}
-
-/*
- * With a userspace irqchip we have to check if the guest de-asserted the
- * timer and if so, unmask the timer irq signal on the host interrupt
- * controller to ensure that we see future timer signals.
- */
-static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu)
-{
-       struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-
-       if (!kvm_timer_should_fire(vtimer)) {
-               kvm_timer_update_irq(vcpu, false, vtimer);
-               if (static_branch_likely(&has_gic_active_state))
-                       set_timer_irq_phys_active(vtimer, false);
-               else
-                       enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
-       }
-}
-
-void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
-{
-       struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-
-       if (unlikely(!timer->enabled))
-               return;
-
-       if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
-               unmask_vtimer_irq_user(vcpu);
-}
-
-int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
-{
-       struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-       struct timer_map map;
-
-       get_timer_map(vcpu, &map);
-
-       /*
-        * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8
-        * and to 0 for ARMv7.  We provide an implementation that always
-        * resets the timer to be disabled and unmasked and is compliant with
-        * the ARMv7 architecture.
-        */
-       vcpu_vtimer(vcpu)->cnt_ctl = 0;
-       vcpu_ptimer(vcpu)->cnt_ctl = 0;
-
-       if (timer->enabled) {
-               kvm_timer_update_irq(vcpu, false, vcpu_vtimer(vcpu));
-               kvm_timer_update_irq(vcpu, false, vcpu_ptimer(vcpu));
-
-               if (irqchip_in_kernel(vcpu->kvm)) {
-                       kvm_vgic_reset_mapped_irq(vcpu, map.direct_vtimer->irq.irq);
-                       if (map.direct_ptimer)
-                               kvm_vgic_reset_mapped_irq(vcpu, map.direct_ptimer->irq.irq);
-               }
-       }
-
-       if (map.emul_ptimer)
-               soft_timer_cancel(&map.emul_ptimer->hrtimer);
-
-       return 0;
-}
-
-/* Make the updates of cntvoff for all vtimer contexts atomic */
-static void update_vtimer_cntvoff(struct kvm_vcpu *vcpu, u64 cntvoff)
-{
-       int i;
-       struct kvm *kvm = vcpu->kvm;
-       struct kvm_vcpu *tmp;
-
-       mutex_lock(&kvm->lock);
-       kvm_for_each_vcpu(i, tmp, kvm)
-               vcpu_vtimer(tmp)->cntvoff = cntvoff;
-
-       /*
-        * When called from the vcpu create path, the VCPU being created is not
-        * included in the loop above, so we just set it here as well.
-        */
-       vcpu_vtimer(vcpu)->cntvoff = cntvoff;
-       mutex_unlock(&kvm->lock);
-}
-
-void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
-{
-       struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-       struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-       struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-
-       /* Synchronize cntvoff across all vtimers of a VM. */
-       update_vtimer_cntvoff(vcpu, kvm_phys_timer_read());
-       ptimer->cntvoff = 0;
-
-       hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
-       timer->bg_timer.function = kvm_bg_timer_expire;
-
-       hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
-       hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
-       vtimer->hrtimer.function = kvm_hrtimer_expire;
-       ptimer->hrtimer.function = kvm_hrtimer_expire;
-
-       vtimer->irq.irq = default_vtimer_irq.irq;
-       ptimer->irq.irq = default_ptimer_irq.irq;
-
-       vtimer->host_timer_irq = host_vtimer_irq;
-       ptimer->host_timer_irq = host_ptimer_irq;
-
-       vtimer->host_timer_irq_flags = host_vtimer_irq_flags;
-       ptimer->host_timer_irq_flags = host_ptimer_irq_flags;
-
-       vtimer->vcpu = vcpu;
-       ptimer->vcpu = vcpu;
-}
-
-static void kvm_timer_init_interrupt(void *info)
-{
-       enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
-       enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags);
-}
-
-int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
-{
-       struct arch_timer_context *timer;
-
-       switch (regid) {
-       case KVM_REG_ARM_TIMER_CTL:
-               timer = vcpu_vtimer(vcpu);
-               kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
-               break;
-       case KVM_REG_ARM_TIMER_CNT:
-               timer = vcpu_vtimer(vcpu);
-               update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value);
-               break;
-       case KVM_REG_ARM_TIMER_CVAL:
-               timer = vcpu_vtimer(vcpu);
-               kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value);
-               break;
-       case KVM_REG_ARM_PTIMER_CTL:
-               timer = vcpu_ptimer(vcpu);
-               kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
-               break;
-       case KVM_REG_ARM_PTIMER_CVAL:
-               timer = vcpu_ptimer(vcpu);
-               kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value);
-               break;
-
-       default:
-               return -1;
-       }
-
-       return 0;
-}
-
-static u64 read_timer_ctl(struct arch_timer_context *timer)
-{
-       /*
-        * Set ISTATUS bit if it's expired.
-        * Note that according to ARMv8 ARM Issue A.k, ISTATUS bit is
-        * UNKNOWN when ENABLE bit is 0, so we chose to set ISTATUS bit
-        * regardless of ENABLE bit for our implementation convenience.
-        */
-       if (!kvm_timer_compute_delta(timer))
-               return timer->cnt_ctl | ARCH_TIMER_CTRL_IT_STAT;
-       else
-               return timer->cnt_ctl;
-}
-
-u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid)
-{
-       switch (regid) {
-       case KVM_REG_ARM_TIMER_CTL:
-               return kvm_arm_timer_read(vcpu,
-                                         vcpu_vtimer(vcpu), TIMER_REG_CTL);
-       case KVM_REG_ARM_TIMER_CNT:
-               return kvm_arm_timer_read(vcpu,
-                                         vcpu_vtimer(vcpu), TIMER_REG_CNT);
-       case KVM_REG_ARM_TIMER_CVAL:
-               return kvm_arm_timer_read(vcpu,
-                                         vcpu_vtimer(vcpu), TIMER_REG_CVAL);
-       case KVM_REG_ARM_PTIMER_CTL:
-               return kvm_arm_timer_read(vcpu,
-                                         vcpu_ptimer(vcpu), TIMER_REG_CTL);
-       case KVM_REG_ARM_PTIMER_CNT:
-               return kvm_arm_timer_read(vcpu,
-                                         vcpu_ptimer(vcpu), TIMER_REG_CNT);
-       case KVM_REG_ARM_PTIMER_CVAL:
-               return kvm_arm_timer_read(vcpu,
-                                         vcpu_ptimer(vcpu), TIMER_REG_CVAL);
-       }
-       return (u64)-1;
-}
-
-static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
-                             struct arch_timer_context *timer,
-                             enum kvm_arch_timer_regs treg)
-{
-       u64 val;
-
-       switch (treg) {
-       case TIMER_REG_TVAL:
-               val = timer->cnt_cval - kvm_phys_timer_read() + timer->cntvoff;
-               val &= lower_32_bits(val);
-               break;
-
-       case TIMER_REG_CTL:
-               val = read_timer_ctl(timer);
-               break;
-
-       case TIMER_REG_CVAL:
-               val = timer->cnt_cval;
-               break;
-
-       case TIMER_REG_CNT:
-               val = kvm_phys_timer_read() - timer->cntvoff;
-               break;
-
-       default:
-               BUG();
-       }
-
-       return val;
-}
-
-u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu,
-                             enum kvm_arch_timers tmr,
-                             enum kvm_arch_timer_regs treg)
-{
-       u64 val;
-
-       preempt_disable();
-       kvm_timer_vcpu_put(vcpu);
-
-       val = kvm_arm_timer_read(vcpu, vcpu_get_timer(vcpu, tmr), treg);
-
-       kvm_timer_vcpu_load(vcpu);
-       preempt_enable();
-
-       return val;
-}
-
-static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
-                               struct arch_timer_context *timer,
-                               enum kvm_arch_timer_regs treg,
-                               u64 val)
-{
-       switch (treg) {
-       case TIMER_REG_TVAL:
-               timer->cnt_cval = kvm_phys_timer_read() - timer->cntvoff + (s32)val;
-               break;
-
-       case TIMER_REG_CTL:
-               timer->cnt_ctl = val & ~ARCH_TIMER_CTRL_IT_STAT;
-               break;
-
-       case TIMER_REG_CVAL:
-               timer->cnt_cval = val;
-               break;
-
-       default:
-               BUG();
-       }
-}
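
The TVAL cases in kvm_arm_timer_read()/kvm_arm_timer_write() above encode the architectural relationship TVAL = CVAL - counter, truncated to 32 bits and sign-extended on write. A standalone sketch of that arithmetic, with "now" standing in for the guest-visible counter (physical counter minus cntvoff); names are illustrative:

#include <stdint.h>

/* a TVAL write programs the comparator relative to the current count */
static uint64_t tval_write_to_cval(uint64_t now, uint64_t val)
{
        return now + (int32_t)val;      /* sign-extend, as (s32)val above */
}

/* a TVAL read reports how far away the comparator is, as a 32-bit value */
static uint32_t tval_read_from_cval(uint64_t now, uint64_t cval)
{
        return (uint32_t)(cval - now);
}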
-
-void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu,
-                               enum kvm_arch_timers tmr,
-                               enum kvm_arch_timer_regs treg,
-                               u64 val)
-{
-       preempt_disable();
-       kvm_timer_vcpu_put(vcpu);
-
-       kvm_arm_timer_write(vcpu, vcpu_get_timer(vcpu, tmr), treg, val);
-
-       kvm_timer_vcpu_load(vcpu);
-       preempt_enable();
-}
-
-static int kvm_timer_starting_cpu(unsigned int cpu)
-{
-       kvm_timer_init_interrupt(NULL);
-       return 0;
-}
-
-static int kvm_timer_dying_cpu(unsigned int cpu)
-{
-       disable_percpu_irq(host_vtimer_irq);
-       return 0;
-}
-
-int kvm_timer_hyp_init(bool has_gic)
-{
-       struct arch_timer_kvm_info *info;
-       int err;
-
-       info = arch_timer_get_kvm_info();
-       timecounter = &info->timecounter;
-
-       if (!timecounter->cc) {
-               kvm_err("kvm_arch_timer: uninitialized timecounter\n");
-               return -ENODEV;
-       }
-
-       /* First, do the virtual EL1 timer irq */
-
-       if (info->virtual_irq <= 0) {
-               kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n",
-                       info->virtual_irq);
-               return -ENODEV;
-       }
-       host_vtimer_irq = info->virtual_irq;
-
-       host_vtimer_irq_flags = irq_get_trigger_type(host_vtimer_irq);
-       if (host_vtimer_irq_flags != IRQF_TRIGGER_HIGH &&
-           host_vtimer_irq_flags != IRQF_TRIGGER_LOW) {
-               kvm_err("Invalid trigger for vtimer IRQ%d, assuming level low\n",
-                       host_vtimer_irq);
-               host_vtimer_irq_flags = IRQF_TRIGGER_LOW;
-       }
-
-       err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler,
-                                "kvm guest vtimer", kvm_get_running_vcpus());
-       if (err) {
-               kvm_err("kvm_arch_timer: can't request vtimer interrupt %d (%d)\n",
-                       host_vtimer_irq, err);
-               return err;
-       }
-
-       if (has_gic) {
-               err = irq_set_vcpu_affinity(host_vtimer_irq,
-                                           kvm_get_running_vcpus());
-               if (err) {
-                       kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
-                       goto out_free_irq;
-               }
-
-               static_branch_enable(&has_gic_active_state);
-       }
-
-       kvm_debug("virtual timer IRQ%d\n", host_vtimer_irq);
-
-       /* Now let's do the physical EL1 timer irq */
-
-       if (info->physical_irq > 0) {
-               host_ptimer_irq = info->physical_irq;
-               host_ptimer_irq_flags = irq_get_trigger_type(host_ptimer_irq);
-               if (host_ptimer_irq_flags != IRQF_TRIGGER_HIGH &&
-                   host_ptimer_irq_flags != IRQF_TRIGGER_LOW) {
-                       kvm_err("Invalid trigger for ptimer IRQ%d, assuming level low\n",
-                               host_ptimer_irq);
-                       host_ptimer_irq_flags = IRQF_TRIGGER_LOW;
-               }
-
-               err = request_percpu_irq(host_ptimer_irq, kvm_arch_timer_handler,
-                                        "kvm guest ptimer", kvm_get_running_vcpus());
-               if (err) {
-                       kvm_err("kvm_arch_timer: can't request ptimer interrupt %d (%d)\n",
-                               host_ptimer_irq, err);
-                       return err;
-               }
-
-               if (has_gic) {
-                       err = irq_set_vcpu_affinity(host_ptimer_irq,
-                                                   kvm_get_running_vcpus());
-                       if (err) {
-                               kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
-                               goto out_free_irq;
-                       }
-               }
-
-               kvm_debug("physical timer IRQ%d\n", host_ptimer_irq);
-       } else if (has_vhe()) {
-               kvm_err("kvm_arch_timer: invalid physical timer IRQ: %d\n",
-                       info->physical_irq);
-               err = -ENODEV;
-               goto out_free_irq;
-       }
-
-       cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING,
-                         "kvm/arm/timer:starting", kvm_timer_starting_cpu,
-                         kvm_timer_dying_cpu);
-       return 0;
-out_free_irq:
-       free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus());
-       return err;
-}
-
-void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
-{
-       struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-
-       soft_timer_cancel(&timer->bg_timer);
-}
-
-static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu)
-{
-       int vtimer_irq, ptimer_irq;
-       int i, ret;
-
-       vtimer_irq = vcpu_vtimer(vcpu)->irq.irq;
-       ret = kvm_vgic_set_owner(vcpu, vtimer_irq, vcpu_vtimer(vcpu));
-       if (ret)
-               return false;
-
-       ptimer_irq = vcpu_ptimer(vcpu)->irq.irq;
-       ret = kvm_vgic_set_owner(vcpu, ptimer_irq, vcpu_ptimer(vcpu));
-       if (ret)
-               return false;
-
-       kvm_for_each_vcpu(i, vcpu, vcpu->kvm) {
-               if (vcpu_vtimer(vcpu)->irq.irq != vtimer_irq ||
-                   vcpu_ptimer(vcpu)->irq.irq != ptimer_irq)
-                       return false;
-       }
-
-       return true;
-}
-
-bool kvm_arch_timer_get_input_level(int vintid)
-{
-       struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
-       struct arch_timer_context *timer;
-
-       if (vintid == vcpu_vtimer(vcpu)->irq.irq)
-               timer = vcpu_vtimer(vcpu);
-       else if (vintid == vcpu_ptimer(vcpu)->irq.irq)
-               timer = vcpu_ptimer(vcpu);
-       else
-               BUG();
-
-       return kvm_timer_should_fire(timer);
-}
-
-int kvm_timer_enable(struct kvm_vcpu *vcpu)
-{
-       struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-       struct timer_map map;
-       int ret;
-
-       if (timer->enabled)
-               return 0;
-
-       /* Without a VGIC we do not map virtual IRQs to physical IRQs */
-       if (!irqchip_in_kernel(vcpu->kvm))
-               goto no_vgic;
-
-       if (!vgic_initialized(vcpu->kvm))
-               return -ENODEV;
-
-       if (!timer_irqs_are_valid(vcpu)) {
-               kvm_debug("incorrectly configured timer irqs\n");
-               return -EINVAL;
-       }
-
-       get_timer_map(vcpu, &map);
-
-       ret = kvm_vgic_map_phys_irq(vcpu,
-                                   map.direct_vtimer->host_timer_irq,
-                                   map.direct_vtimer->irq.irq,
-                                   kvm_arch_timer_get_input_level);
-       if (ret)
-               return ret;
-
-       if (map.direct_ptimer) {
-               ret = kvm_vgic_map_phys_irq(vcpu,
-                                           map.direct_ptimer->host_timer_irq,
-                                           map.direct_ptimer->irq.irq,
-                                           kvm_arch_timer_get_input_level);
-       }
-
-       if (ret)
-               return ret;
-
-no_vgic:
-       timer->enabled = 1;
-       return 0;
-}
-
-/*
- * On a VHE system, we only need to configure the EL2 timer trap register
- * once, not for every world switch. The host kernel runs at EL2 with
- * HCR_EL2.TGE == 1, which makes those bits have no effect on host kernel
- * execution.
- */
-void kvm_timer_init_vhe(void)
-{
-       /* When HCR_EL2.E2H == 1, EL1PCEN and EL1PCTEN are shifted by 10 */
-       u32 cnthctl_shift = 10;
-       u64 val;
-
-       /*
-        * VHE systems allow the guest direct access to the EL1 physical
-        * timer/counter.
-        */
-       val = read_sysreg(cnthctl_el2);
-       val |= (CNTHCTL_EL1PCEN << cnthctl_shift);
-       val |= (CNTHCTL_EL1PCTEN << cnthctl_shift);
-       write_sysreg(val, cnthctl_el2);
-}
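
With HCR_EL2.E2H == 1, the EL1PCTEN/EL1PCEN controls sit at CNTHCTL_EL2 bits [10] and [11] rather than [0] and [1], which is what the shift by 10 above encodes. A compilable sketch of the resulting mask, with constants assumed to mirror the kernel's CNTHCTL_* definitions:

#include <stdint.h>
#include <stdio.h>

#define CNTHCTL_EL1PCTEN        (1ULL << 0)     /* EL1/EL0 physical counter access */
#define CNTHCTL_EL1PCEN         (1ULL << 1)     /* EL1/EL0 physical timer access */

int main(void)
{
        uint64_t val = 0;
        uint32_t cnthctl_shift = 10;            /* HCR_EL2.E2H == 1 layout */

        val |= CNTHCTL_EL1PCEN << cnthctl_shift;        /* bit 11 */
        val |= CNTHCTL_EL1PCTEN << cnthctl_shift;       /* bit 10 */

        printf("cnthctl_el2 |= %#llx\n", (unsigned long long)val);      /* 0xc00 */
        return 0;
}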
-
-static void set_timer_irqs(struct kvm *kvm, int vtimer_irq, int ptimer_irq)
-{
-       struct kvm_vcpu *vcpu;
-       int i;
-
-       kvm_for_each_vcpu(i, vcpu, kvm) {
-               vcpu_vtimer(vcpu)->irq.irq = vtimer_irq;
-               vcpu_ptimer(vcpu)->irq.irq = ptimer_irq;
-       }
-}
-
-int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
-{
-       int __user *uaddr = (int __user *)(long)attr->addr;
-       struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-       struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-       int irq;
-
-       if (!irqchip_in_kernel(vcpu->kvm))
-               return -EINVAL;
-
-       if (get_user(irq, uaddr))
-               return -EFAULT;
-
-       if (!(irq_is_ppi(irq)))
-               return -EINVAL;
-
-       if (vcpu->arch.timer_cpu.enabled)
-               return -EBUSY;
-
-       switch (attr->attr) {
-       case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
-               set_timer_irqs(vcpu->kvm, irq, ptimer->irq.irq);
-               break;
-       case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
-               set_timer_irqs(vcpu->kvm, vtimer->irq.irq, irq);
-               break;
-       default:
-               return -ENXIO;
-       }
-
-       return 0;
-}
-
-int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
-{
-       int __user *uaddr = (int __user *)(long)attr->addr;
-       struct arch_timer_context *timer;
-       int irq;
-
-       switch (attr->attr) {
-       case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
-               timer = vcpu_vtimer(vcpu);
-               break;
-       case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
-               timer = vcpu_ptimer(vcpu);
-               break;
-       default:
-               return -ENXIO;
-       }
-
-       irq = timer->irq.irq;
-       return put_user(irq, uaddr);
-}
-
-int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
-{
-       switch (attr->attr) {
-       case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
-       case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
-               return 0;
-       }
-
-       return -ENXIO;
-}
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
deleted file mode 100644 (file)
index 48d0ec4..0000000
+++ /dev/null
@@ -1,1681 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2012 - Virtual Open Systems and Columbia University
- * Author: Christoffer Dall <c.dall@virtualopensystems.com>
- */
-
-#include <linux/bug.h>
-#include <linux/cpu_pm.h>
-#include <linux/errno.h>
-#include <linux/err.h>
-#include <linux/kvm_host.h>
-#include <linux/list.h>
-#include <linux/module.h>
-#include <linux/vmalloc.h>
-#include <linux/fs.h>
-#include <linux/mman.h>
-#include <linux/sched.h>
-#include <linux/kvm.h>
-#include <linux/kvm_irqfd.h>
-#include <linux/irqbypass.h>
-#include <linux/sched/stat.h>
-#include <trace/events/kvm.h>
-
-#define CREATE_TRACE_POINTS
-#include "trace.h"
-
-#include <linux/uaccess.h>
-#include <asm/ptrace.h>
-#include <asm/mman.h>
-#include <asm/tlbflush.h>
-#include <asm/cacheflush.h>
-#include <asm/cpufeature.h>
-#include <asm/virt.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_asm.h>
-#include <asm/kvm_mmu.h>
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_coproc.h>
-#include <asm/sections.h>
-
-#include <kvm/arm_hypercalls.h>
-#include <kvm/arm_pmu.h>
-#include <kvm/arm_psci.h>
-
-#ifdef REQUIRES_VIRT
-__asm__(".arch_extension       virt");
-#endif
-
-DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data);
-static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
-
-/* The VMID used in the VTTBR */
-static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
-static u32 kvm_next_vmid;
-static DEFINE_SPINLOCK(kvm_vmid_lock);
-
-static bool vgic_present;
-
-static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
-DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
-
-int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
-{
-       return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
-}
-
-int kvm_arch_hardware_setup(void *opaque)
-{
-       return 0;
-}
-
-int kvm_arch_check_processor_compat(void *opaque)
-{
-       return 0;
-}
-
-int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
-                           struct kvm_enable_cap *cap)
-{
-       int r;
-
-       if (cap->flags)
-               return -EINVAL;
-
-       switch (cap->cap) {
-       case KVM_CAP_ARM_NISV_TO_USER:
-               r = 0;
-               kvm->arch.return_nisv_io_abort_to_user = true;
-               break;
-       default:
-               r = -EINVAL;
-               break;
-       }
-
-       return r;
-}
-
-/**
- * kvm_arch_init_vm - initializes a VM data structure
- * @kvm:       pointer to the KVM struct
- */
-int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
-{
-       int ret, cpu;
-
-       ret = kvm_arm_setup_stage2(kvm, type);
-       if (ret)
-               return ret;
-
-       kvm->arch.last_vcpu_ran = alloc_percpu(typeof(*kvm->arch.last_vcpu_ran));
-       if (!kvm->arch.last_vcpu_ran)
-               return -ENOMEM;
-
-       for_each_possible_cpu(cpu)
-               *per_cpu_ptr(kvm->arch.last_vcpu_ran, cpu) = -1;
-
-       ret = kvm_alloc_stage2_pgd(kvm);
-       if (ret)
-               goto out_fail_alloc;
-
-       ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP);
-       if (ret)
-               goto out_free_stage2_pgd;
-
-       kvm_vgic_early_init(kvm);
-
-       /* Mark the initial VMID generation invalid */
-       kvm->arch.vmid.vmid_gen = 0;
-
-       /* The maximum number of VCPUs is limited by the host's GIC model */
-       kvm->arch.max_vcpus = vgic_present ?
-                               kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;
-
-       return ret;
-out_free_stage2_pgd:
-       kvm_free_stage2_pgd(kvm);
-out_fail_alloc:
-       free_percpu(kvm->arch.last_vcpu_ran);
-       kvm->arch.last_vcpu_ran = NULL;
-       return ret;
-}
-
-int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
-{
-       return 0;
-}
-
-vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
-{
-       return VM_FAULT_SIGBUS;
-}
-
-
-/**
- * kvm_arch_destroy_vm - destroy the VM data structure
- * @kvm:       pointer to the KVM struct
- */
-void kvm_arch_destroy_vm(struct kvm *kvm)
-{
-       int i;
-
-       kvm_vgic_destroy(kvm);
-
-       free_percpu(kvm->arch.last_vcpu_ran);
-       kvm->arch.last_vcpu_ran = NULL;
-
-       for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-               if (kvm->vcpus[i]) {
-                       kvm_vcpu_destroy(kvm->vcpus[i]);
-                       kvm->vcpus[i] = NULL;
-               }
-       }
-       atomic_set(&kvm->online_vcpus, 0);
-}
-
-int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
-{
-       int r;
-       switch (ext) {
-       case KVM_CAP_IRQCHIP:
-               r = vgic_present;
-               break;
-       case KVM_CAP_IOEVENTFD:
-       case KVM_CAP_DEVICE_CTRL:
-       case KVM_CAP_USER_MEMORY:
-       case KVM_CAP_SYNC_MMU:
-       case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
-       case KVM_CAP_ONE_REG:
-       case KVM_CAP_ARM_PSCI:
-       case KVM_CAP_ARM_PSCI_0_2:
-       case KVM_CAP_READONLY_MEM:
-       case KVM_CAP_MP_STATE:
-       case KVM_CAP_IMMEDIATE_EXIT:
-       case KVM_CAP_VCPU_EVENTS:
-       case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2:
-       case KVM_CAP_ARM_NISV_TO_USER:
-       case KVM_CAP_ARM_INJECT_EXT_DABT:
-               r = 1;
-               break;
-       case KVM_CAP_ARM_SET_DEVICE_ADDR:
-               r = 1;
-               break;
-       case KVM_CAP_NR_VCPUS:
-               r = num_online_cpus();
-               break;
-       case KVM_CAP_MAX_VCPUS:
-               r = KVM_MAX_VCPUS;
-               break;
-       case KVM_CAP_MAX_VCPU_ID:
-               r = KVM_MAX_VCPU_ID;
-               break;
-       case KVM_CAP_MSI_DEVID:
-               if (!kvm)
-                       r = -EINVAL;
-               else
-                       r = kvm->arch.vgic.msis_require_devid;
-               break;
-       case KVM_CAP_ARM_USER_IRQ:
-               /*
-                * 1: EL1_VTIMER, EL1_PTIMER, and PMU.
-                * (bump this number if adding more devices)
-                */
-               r = 1;
-               break;
-       default:
-               r = kvm_arch_vm_ioctl_check_extension(kvm, ext);
-               break;
-       }
-       return r;
-}
-
-long kvm_arch_dev_ioctl(struct file *filp,
-                       unsigned int ioctl, unsigned long arg)
-{
-       return -EINVAL;
-}
-
-struct kvm *kvm_arch_alloc_vm(void)
-{
-       if (!has_vhe())
-               return kzalloc(sizeof(struct kvm), GFP_KERNEL);
-
-       return vzalloc(sizeof(struct kvm));
-}
-
-void kvm_arch_free_vm(struct kvm *kvm)
-{
-       if (!has_vhe())
-               kfree(kvm);
-       else
-               vfree(kvm);
-}
-
-int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
-{
-       if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
-               return -EBUSY;
-
-       if (id >= kvm->arch.max_vcpus)
-               return -EINVAL;
-
-       return 0;
-}
-
-int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
-{
-       int err;
-
-       /* Force users to call KVM_ARM_VCPU_INIT */
-       vcpu->arch.target = -1;
-       bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
-
-       /* Set up the timer */
-       kvm_timer_vcpu_init(vcpu);
-
-       kvm_pmu_vcpu_init(vcpu);
-
-       kvm_arm_reset_debug_ptr(vcpu);
-
-       kvm_arm_pvtime_vcpu_init(&vcpu->arch);
-
-       err = kvm_vgic_vcpu_init(vcpu);
-       if (err)
-               return err;
-
-       return create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP);
-}
-
-void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
-{
-}
-
-void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
-{
-       if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm)))
-               static_branch_dec(&userspace_irqchip_in_use);
-
-       kvm_mmu_free_memory_caches(vcpu);
-       kvm_timer_vcpu_terminate(vcpu);
-       kvm_pmu_vcpu_destroy(vcpu);
-
-       kvm_arm_vcpu_destroy(vcpu);
-}
-
-int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
-{
-       return kvm_timer_is_pending(vcpu);
-}
-
-void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
-{
-       /*
-        * If we're about to block (most likely because we've just hit a
-        * WFI), we need to sync back the state of the GIC CPU interface
-        * so that we have the latest PMR and group enables. This ensures
-        * that kvm_arch_vcpu_runnable has up-to-date data to decide
-        * whether we have pending interrupts.
-        *
-        * For the same reason, we want to tell GICv4 that we need
-        * doorbells to be signalled, should an interrupt become pending.
-        */
-       preempt_disable();
-       kvm_vgic_vmcr_sync(vcpu);
-       vgic_v4_put(vcpu, true);
-       preempt_enable();
-}
-
-void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
-{
-       preempt_disable();
-       vgic_v4_load(vcpu);
-       preempt_enable();
-}
-
-void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
-       int *last_ran;
-       kvm_host_data_t *cpu_data;
-
-       last_ran = this_cpu_ptr(vcpu->kvm->arch.last_vcpu_ran);
-       cpu_data = this_cpu_ptr(&kvm_host_data);
-
-       /*
-        * We might get preempted before the vCPU actually runs, but
-        * over-invalidation doesn't affect correctness.
-        */
-       if (*last_ran != vcpu->vcpu_id) {
-               kvm_call_hyp(__kvm_tlb_flush_local_vmid, vcpu);
-               *last_ran = vcpu->vcpu_id;
-       }
-
-       vcpu->cpu = cpu;
-       vcpu->arch.host_cpu_context = &cpu_data->host_ctxt;
-
-       kvm_vgic_load(vcpu);
-       kvm_timer_vcpu_load(vcpu);
-       kvm_vcpu_load_sysregs(vcpu);
-       kvm_arch_vcpu_load_fp(vcpu);
-       kvm_vcpu_pmu_restore_guest(vcpu);
-       if (kvm_arm_is_pvtime_enabled(&vcpu->arch))
-               kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu);
-
-       if (single_task_running())
-               vcpu_clear_wfx_traps(vcpu);
-       else
-               vcpu_set_wfx_traps(vcpu);
-
-       vcpu_ptrauth_setup_lazy(vcpu);
-}
-
-void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
-{
-       kvm_arch_vcpu_put_fp(vcpu);
-       kvm_vcpu_put_sysregs(vcpu);
-       kvm_timer_vcpu_put(vcpu);
-       kvm_vgic_put(vcpu);
-       kvm_vcpu_pmu_restore_host(vcpu);
-
-       vcpu->cpu = -1;
-}
-
-static void vcpu_power_off(struct kvm_vcpu *vcpu)
-{
-       vcpu->arch.power_off = true;
-       kvm_make_request(KVM_REQ_SLEEP, vcpu);
-       kvm_vcpu_kick(vcpu);
-}
-
-int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
-                                   struct kvm_mp_state *mp_state)
-{
-       if (vcpu->arch.power_off)
-               mp_state->mp_state = KVM_MP_STATE_STOPPED;
-       else
-               mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
-
-       return 0;
-}
-
-int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
-                                   struct kvm_mp_state *mp_state)
-{
-       int ret = 0;
-
-       switch (mp_state->mp_state) {
-       case KVM_MP_STATE_RUNNABLE:
-               vcpu->arch.power_off = false;
-               break;
-       case KVM_MP_STATE_STOPPED:
-               vcpu_power_off(vcpu);
-               break;
-       default:
-               ret = -EINVAL;
-       }
-
-       return ret;
-}
-
-/**
- * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled
- * @v:         The VCPU pointer
- *
- * If the guest CPU is not waiting for interrupts or an interrupt line is
- * asserted, the CPU is by definition runnable.
- */
-int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
-{
-       bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF);
-       return ((irq_lines || kvm_vgic_vcpu_pending_irq(v))
-               && !v->arch.power_off && !v->arch.pause);
-}
-
-bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
-{
-       return vcpu_mode_priv(vcpu);
-}
-
-/* Just ensure a guest exit from a particular CPU */
-static void exit_vm_noop(void *info)
-{
-}
-
-void force_vm_exit(const cpumask_t *mask)
-{
-       preempt_disable();
-       smp_call_function_many(mask, exit_vm_noop, NULL, true);
-       preempt_enable();
-}
-
-/**
- * need_new_vmid_gen - check that the VMID is still valid
- * @vmid: The VMID to check
- *
- * return true if there is a new generation of VMIDs being used
- *
- * The hardware supports a limited set of values with the value zero reserved
- * for the host, so we check if an assigned value belongs to a previous
- * generation, which requires us to assign a new value. If we're the
- * first to use a VMID for the new generation, we must flush necessary caches
- * and TLBs on all CPUs.
- */
-static bool need_new_vmid_gen(struct kvm_vmid *vmid)
-{
-       u64 current_vmid_gen = atomic64_read(&kvm_vmid_gen);
-       smp_rmb(); /* Orders read of kvm_vmid_gen and kvm->arch.vmid */
-       return unlikely(READ_ONCE(vmid->vmid_gen) != current_vmid_gen);
-}
-
-/**
- * update_vmid - Update the vmid with a valid VMID for the current generation
- * @vmid: The stage-2 VMID information struct
- */
-static void update_vmid(struct kvm_vmid *vmid)
-{
-       if (!need_new_vmid_gen(vmid))
-               return;
-
-       spin_lock(&kvm_vmid_lock);
-
-       /*
-        * We need to re-check the vmid_gen here to ensure that if another vcpu
-        * already allocated a valid vmid for this vm, then this vcpu should
-        * use the same vmid.
-        */
-       if (!need_new_vmid_gen(vmid)) {
-               spin_unlock(&kvm_vmid_lock);
-               return;
-       }
-
-       /* First user of a new VMID generation? */
-       if (unlikely(kvm_next_vmid == 0)) {
-               atomic64_inc(&kvm_vmid_gen);
-               kvm_next_vmid = 1;
-
-               /*
-                * On SMP we know no other CPUs can use this CPU's or each
-                * other's VMID after force_vm_exit returns since the
-                * kvm_vmid_lock blocks them from reentry to the guest.
-                */
-               force_vm_exit(cpu_all_mask);
-               /*
-                * Now broadcast TLB + ICACHE invalidation over the inner
-                * shareable domain to make sure all data structures are
-                * clean.
-                */
-               kvm_call_hyp(__kvm_flush_vm_context);
-       }
-
-       vmid->vmid = kvm_next_vmid;
-       kvm_next_vmid++;
-       kvm_next_vmid &= (1 << kvm_get_vmid_bits()) - 1;
-
-       smp_wmb();
-       WRITE_ONCE(vmid->vmid_gen, atomic64_read(&kvm_vmid_gen));
-
-       spin_unlock(&kvm_vmid_lock);
-}
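
The allocator above reduces to a small model: each VM caches (vmid, generation), a VMID is usable only while its generation matches the global one, and wrapping the VMID space bumps the generation so every cached VMID goes stale at once. The real code additionally takes kvm_vmid_lock, forces guest exits, and flushes TLBs/I-caches. A condensed, standalone sketch with illustrative names:

#include <stdint.h>
#include <stdbool.h>

#define VMID_BITS       8               /* 16 on hardware with VTCR_EL2.VS = 1 */

struct toy_vmid { uint64_t gen; uint32_t vmid; };

static uint64_t global_gen = 1;
static uint32_t next_vmid  = 1;         /* VMID 0 is reserved for the host */

static bool vmid_is_stale(const struct toy_vmid *v)
{
        return v->gen != global_gen;
}

static void toy_update_vmid(struct toy_vmid *v)
{
        if (!vmid_is_stale(v))
                return;

        if (next_vmid == 0) {           /* space exhausted: new generation */
                global_gen++;
                next_vmid = 1;
        }

        v->vmid = next_vmid++;
        next_vmid &= (1U << VMID_BITS) - 1;
        v->gen = global_gen;
}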
-
-static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
-{
-       struct kvm *kvm = vcpu->kvm;
-       int ret = 0;
-
-       if (likely(vcpu->arch.has_run_once))
-               return 0;
-
-       if (!kvm_arm_vcpu_is_finalized(vcpu))
-               return -EPERM;
-
-       vcpu->arch.has_run_once = true;
-
-       if (likely(irqchip_in_kernel(kvm))) {
-               /*
-                * Map the VGIC hardware resources before running a vcpu the
-                * first time on this VM.
-                */
-               if (unlikely(!vgic_ready(kvm))) {
-                       ret = kvm_vgic_map_resources(kvm);
-                       if (ret)
-                               return ret;
-               }
-       } else {
-               /*
-                * Tell the rest of the code that there are userspace irqchip
-                * VMs in the wild.
-                */
-               static_branch_inc(&userspace_irqchip_in_use);
-       }
-
-       ret = kvm_timer_enable(vcpu);
-       if (ret)
-               return ret;
-
-       ret = kvm_arm_pmu_v3_enable(vcpu);
-
-       return ret;
-}
-
-bool kvm_arch_intc_initialized(struct kvm *kvm)
-{
-       return vgic_initialized(kvm);
-}
-
-void kvm_arm_halt_guest(struct kvm *kvm)
-{
-       int i;
-       struct kvm_vcpu *vcpu;
-
-       kvm_for_each_vcpu(i, vcpu, kvm)
-               vcpu->arch.pause = true;
-       kvm_make_all_cpus_request(kvm, KVM_REQ_SLEEP);
-}
-
-void kvm_arm_resume_guest(struct kvm *kvm)
-{
-       int i;
-       struct kvm_vcpu *vcpu;
-
-       kvm_for_each_vcpu(i, vcpu, kvm) {
-               vcpu->arch.pause = false;
-               swake_up_one(kvm_arch_vcpu_wq(vcpu));
-       }
-}
-
-static void vcpu_req_sleep(struct kvm_vcpu *vcpu)
-{
-       struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
-
-       swait_event_interruptible_exclusive(*wq, ((!vcpu->arch.power_off) &&
-                                      (!vcpu->arch.pause)));
-
-       if (vcpu->arch.power_off || vcpu->arch.pause) {
-               /* Awaken to handle a signal, request we sleep again later. */
-               kvm_make_request(KVM_REQ_SLEEP, vcpu);
-       }
-
-       /*
-        * Make sure we will observe a potential reset request if we've
-        * observed a change to the power state. Pairs with the smp_wmb() in
-        * kvm_psci_vcpu_on().
-        */
-       smp_rmb();
-}
-
-static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
-{
-       return vcpu->arch.target >= 0;
-}
-
-static void check_vcpu_requests(struct kvm_vcpu *vcpu)
-{
-       if (kvm_request_pending(vcpu)) {
-               if (kvm_check_request(KVM_REQ_SLEEP, vcpu))
-                       vcpu_req_sleep(vcpu);
-
-               if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
-                       kvm_reset_vcpu(vcpu);
-
-               /*
-                * Clear IRQ_PENDING requests that were made to guarantee
-                * that a VCPU sees new virtual interrupts.
-                */
-               kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu);
-
-               if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu))
-                       kvm_update_stolen_time(vcpu);
-
-               if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) {
-                       /* The distributor enable bits were changed */
-                       preempt_disable();
-                       vgic_v4_put(vcpu, false);
-                       vgic_v4_load(vcpu);
-                       preempt_enable();
-               }
-       }
-}
-
-/**
- * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
- * @vcpu:      The VCPU pointer
- * @run:       The kvm_run structure pointer used for userspace state exchange
- *
- * This function is called through the KVM_RUN ioctl from user space. It
- * executes VM code in a loop until the time slice for the process is used up
- * or some emulation is needed from user space, in which case it returns 0
- * with the kvm_run structure filled in with the required data for the
- * requested emulation.
- */
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
-{
-       int ret;
-
-       if (unlikely(!kvm_vcpu_initialized(vcpu)))
-               return -ENOEXEC;
-
-       ret = kvm_vcpu_first_run_init(vcpu);
-       if (ret)
-               return ret;
-
-       if (run->exit_reason == KVM_EXIT_MMIO) {
-               ret = kvm_handle_mmio_return(vcpu, vcpu->run);
-               if (ret)
-                       return ret;
-       }
-
-       if (run->immediate_exit)
-               return -EINTR;
-
-       vcpu_load(vcpu);
-
-       kvm_sigset_activate(vcpu);
-
-       ret = 1;
-       run->exit_reason = KVM_EXIT_UNKNOWN;
-       while (ret > 0) {
-               /*
-                * Check conditions before entering the guest
-                */
-               cond_resched();
-
-               update_vmid(&vcpu->kvm->arch.vmid);
-
-               check_vcpu_requests(vcpu);
-
-               /*
-                * Preparing the interrupts to be injected also
-                * involves poking the GIC, which must be done in a
-                * non-preemptible context.
-                */
-               preempt_disable();
-
-               kvm_pmu_flush_hwstate(vcpu);
-
-               local_irq_disable();
-
-               kvm_vgic_flush_hwstate(vcpu);
-
-               /*
-                * Exit if we have a signal pending so that we can deliver the
-                * signal to user space.
-                */
-               if (signal_pending(current)) {
-                       ret = -EINTR;
-                       run->exit_reason = KVM_EXIT_INTR;
-               }
-
-               /*
-                * If we're using a userspace irqchip, then check if we need
-                * to tell a userspace irqchip about timer or PMU level
-                * changes and if so, exit to userspace (the actual level
-                * state gets updated in kvm_timer_update_run and
-                * kvm_pmu_update_run below).
-                */
-               if (static_branch_unlikely(&userspace_irqchip_in_use)) {
-                       if (kvm_timer_should_notify_user(vcpu) ||
-                           kvm_pmu_should_notify_user(vcpu)) {
-                               ret = -EINTR;
-                               run->exit_reason = KVM_EXIT_INTR;
-                       }
-               }
-
-               /*
-                * Ensure we set mode to IN_GUEST_MODE after we disable
-                * interrupts and before the final VCPU requests check.
-                * See the comment in kvm_vcpu_exiting_guest_mode() and
-                * Documentation/virt/kvm/vcpu-requests.rst
-                */
-               smp_store_mb(vcpu->mode, IN_GUEST_MODE);
-
-               if (ret <= 0 || need_new_vmid_gen(&vcpu->kvm->arch.vmid) ||
-                   kvm_request_pending(vcpu)) {
-                       vcpu->mode = OUTSIDE_GUEST_MODE;
-                       isb(); /* Ensure work in x_flush_hwstate is committed */
-                       kvm_pmu_sync_hwstate(vcpu);
-                       if (static_branch_unlikely(&userspace_irqchip_in_use))
-                               kvm_timer_sync_hwstate(vcpu);
-                       kvm_vgic_sync_hwstate(vcpu);
-                       local_irq_enable();
-                       preempt_enable();
-                       continue;
-               }
-
-               kvm_arm_setup_debug(vcpu);
-
-               /**************************************************************
-                * Enter the guest
-                */
-               trace_kvm_entry(*vcpu_pc(vcpu));
-               guest_enter_irqoff();
-
-               if (has_vhe()) {
-                       ret = kvm_vcpu_run_vhe(vcpu);
-               } else {
-                       ret = kvm_call_hyp_ret(__kvm_vcpu_run_nvhe, vcpu);
-               }
-
-               vcpu->mode = OUTSIDE_GUEST_MODE;
-               vcpu->stat.exits++;
-               /*
-                * Back from guest
-                *************************************************************/
-
-               kvm_arm_clear_debug(vcpu);
-
-               /*
-                * We must sync the PMU state before the vgic state so
-                * that the vgic can properly sample the updated state of the
-                * interrupt line.
-                */
-               kvm_pmu_sync_hwstate(vcpu);
-
-               /*
-                * Sync the vgic state before syncing the timer state because
-                * the timer code needs to know if the virtual timer
-                * interrupts are active.
-                */
-               kvm_vgic_sync_hwstate(vcpu);
-
-               /*
-                * Sync the timer hardware state before enabling interrupts as
-                * we don't want vtimer interrupts to race with syncing the
-                * timer virtual interrupt state.
-                */
-               if (static_branch_unlikely(&userspace_irqchip_in_use))
-                       kvm_timer_sync_hwstate(vcpu);
-
-               kvm_arch_vcpu_ctxsync_fp(vcpu);
-
-               /*
-                * We may have taken a host interrupt in HYP mode (ie
-                * while executing the guest). This interrupt is still
-                * pending, as we haven't serviced it yet!
-                *
-                * We're now back in SVC mode, with interrupts
-                * disabled.  Enabling the interrupts now will have
-                * the effect of taking the interrupt again, in SVC
-                * mode this time.
-                */
-               local_irq_enable();
-
-               /*
-                * We do local_irq_enable() before calling guest_exit() so
-                * that if a timer interrupt hits while running the guest we
-                * account that tick as being spent in the guest.  We enable
-                * preemption after calling guest_exit() so that if we get
-                * preempted we make sure ticks after that are not counted as
-                * guest time.
-                */
-               guest_exit();
-               trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
-
-               /* Exit types that need handling before we can be preempted */
-               handle_exit_early(vcpu, run, ret);
-
-               preempt_enable();
-
-               ret = handle_exit(vcpu, run, ret);
-       }
-
-       /* Tell userspace about in-kernel device output levels */
-       if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
-               kvm_timer_update_run(vcpu);
-               kvm_pmu_update_run(vcpu);
-       }
-
-       kvm_sigset_deactivate(vcpu);
-
-       vcpu_put(vcpu);
-       return ret;
-}
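
For reference, a minimal userspace counterpart to the run loop above might look
like the sketch below. It assumes vcpu_fd and the mmap'd kvm_run structure were
obtained through the usual KVM_CREATE_VCPU / KVM_GET_VCPU_MMAP_SIZE sequence;
only the exit reasons touched on above are handled, and run_vcpu() is a made-up
helper name, not kernel code.

#include <errno.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Drive KVM_RUN until an exit that the caller has to handle itself. */
static int run_vcpu(int vcpu_fd, struct kvm_run *run)
{
        for (;;) {
                if (ioctl(vcpu_fd, KVM_RUN, 0) < 0) {
                        if (errno == EINTR)     /* signal: the KVM_EXIT_INTR path above */
                                continue;
                        return -errno;
                }

                switch (run->exit_reason) {
                case KVM_EXIT_INTR:
                        continue;               /* interrupted, simply re-enter */
                case KVM_EXIT_MMIO:
                        /* emulate the access described by run->mmio, then re-enter */
                        continue;
                default:
                        return 0;               /* caller inspects run->exit_reason */
                }
        }
}
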
-
-static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level)
-{
-       int bit_index;
-       bool set;
-       unsigned long *hcr;
-
-       if (number == KVM_ARM_IRQ_CPU_IRQ)
-               bit_index = __ffs(HCR_VI);
-       else /* KVM_ARM_IRQ_CPU_FIQ */
-               bit_index = __ffs(HCR_VF);
-
-       hcr = vcpu_hcr(vcpu);
-       if (level)
-               set = test_and_set_bit(bit_index, hcr);
-       else
-               set = test_and_clear_bit(bit_index, hcr);
-
-       /*
-        * If we didn't change anything, no need to wake up or kick other CPUs
-        */
-       if (set == level)
-               return 0;
-
-       /*
-        * The vcpu irq_lines field was updated, wake up sleeping VCPUs and
-        * trigger a world-switch round on the running physical CPU to set the
-        * virtual IRQ/FIQ fields in the HCR appropriately.
-        */
-       kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
-       kvm_vcpu_kick(vcpu);
-
-       return 0;
-}
-
-int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
-                         bool line_status)
-{
-       u32 irq = irq_level->irq;
-       unsigned int irq_type, vcpu_idx, irq_num;
-       int nrcpus = atomic_read(&kvm->online_vcpus);
-       struct kvm_vcpu *vcpu = NULL;
-       bool level = irq_level->level;
-
-       irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK;
-       vcpu_idx = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK;
-       vcpu_idx += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1);
-       irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK;
-
-       trace_kvm_irq_line(irq_type, vcpu_idx, irq_num, irq_level->level);
-
-       switch (irq_type) {
-       case KVM_ARM_IRQ_TYPE_CPU:
-               if (irqchip_in_kernel(kvm))
-                       return -ENXIO;
-
-               if (vcpu_idx >= nrcpus)
-                       return -EINVAL;
-
-               vcpu = kvm_get_vcpu(kvm, vcpu_idx);
-               if (!vcpu)
-                       return -EINVAL;
-
-               if (irq_num > KVM_ARM_IRQ_CPU_FIQ)
-                       return -EINVAL;
-
-               return vcpu_interrupt_line(vcpu, irq_num, level);
-       case KVM_ARM_IRQ_TYPE_PPI:
-               if (!irqchip_in_kernel(kvm))
-                       return -ENXIO;
-
-               if (vcpu_idx >= nrcpus)
-                       return -EINVAL;
-
-               vcpu = kvm_get_vcpu(kvm, vcpu_idx);
-               if (!vcpu)
-                       return -EINVAL;
-
-               if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS)
-                       return -EINVAL;
-
-               return kvm_vgic_inject_irq(kvm, vcpu->vcpu_id, irq_num, level, NULL);
-       case KVM_ARM_IRQ_TYPE_SPI:
-               if (!irqchip_in_kernel(kvm))
-                       return -ENXIO;
-
-               if (irq_num < VGIC_NR_PRIVATE_IRQS)
-                       return -EINVAL;
-
-               return kvm_vgic_inject_irq(kvm, 0, irq_num, level, NULL);
-       }
-
-       return -EINVAL;
-}
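
The bitfield decoding at the top of kvm_vm_ioctl_irq_line() has a matching
encoding step on the userspace side. The sketch below shows one hypothetical
way to build the irq word for the KVM_IRQ_LINE ioctl using the uapi masks from
<linux/kvm.h>; encode_arm_irq() is an illustrative helper, not part of the
kernel.

#include <linux/kvm.h>

static __u32 encode_arm_irq(__u32 type, __u32 vcpu_idx, __u32 irq_num)
{
        __u32 vcpu2 = vcpu_idx / (KVM_ARM_IRQ_VCPU_MASK + 1);
        __u32 vcpu  = vcpu_idx % (KVM_ARM_IRQ_VCPU_MASK + 1);

        return ((type  & KVM_ARM_IRQ_TYPE_MASK)  << KVM_ARM_IRQ_TYPE_SHIFT)  |
               ((vcpu2 & KVM_ARM_IRQ_VCPU2_MASK) << KVM_ARM_IRQ_VCPU2_SHIFT) |
               ((vcpu  & KVM_ARM_IRQ_VCPU_MASK)  << KVM_ARM_IRQ_VCPU_SHIFT)  |
               ((irq_num & KVM_ARM_IRQ_NUM_MASK) << KVM_ARM_IRQ_NUM_SHIFT);
}

/* e.g. assert a private interrupt (PPI 27) on vcpu 3:
 *
 *      struct kvm_irq_level l = {
 *              .irq   = encode_arm_irq(KVM_ARM_IRQ_TYPE_PPI, 3, 27),
 *              .level = 1,
 *      };
 *      ioctl(vm_fd, KVM_IRQ_LINE, &l);
 */
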
-
-static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
-                              const struct kvm_vcpu_init *init)
-{
-       unsigned int i, ret;
-       int phys_target = kvm_target_cpu();
-
-       if (init->target != phys_target)
-               return -EINVAL;
-
-       /*
-        * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
-        * use the same target.
-        */
-       if (vcpu->arch.target != -1 && vcpu->arch.target != init->target)
-               return -EINVAL;
-
-       /* -ENOENT for unknown features, -EINVAL for invalid combinations. */
-       for (i = 0; i < sizeof(init->features) * 8; i++) {
-               bool set = (init->features[i / 32] & (1 << (i % 32)));
-
-               if (set && i >= KVM_VCPU_MAX_FEATURES)
-                       return -ENOENT;
-
-               /*
-                * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
-                * use the same feature set.
-                */
-               if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES &&
-                   test_bit(i, vcpu->arch.features) != set)
-                       return -EINVAL;
-
-               if (set)
-                       set_bit(i, vcpu->arch.features);
-       }
-
-       vcpu->arch.target = phys_target;
-
-       /* Now we know what it is, we can reset it. */
-       ret = kvm_reset_vcpu(vcpu);
-       if (ret) {
-               vcpu->arch.target = -1;
-               bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
-       }
-
-       return ret;
-}
-
-static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
-                                        struct kvm_vcpu_init *init)
-{
-       int ret;
-
-       ret = kvm_vcpu_set_target(vcpu, init);
-       if (ret)
-               return ret;
-
-       /*
-        * Ensure a rebooted VM will fault in RAM pages and detect if the
-        * guest MMU is turned off and flush the caches as needed.
-        */
-       if (vcpu->arch.has_run_once)
-               stage2_unmap_vm(vcpu->kvm);
-
-       vcpu_reset_hcr(vcpu);
-
-       /*
-        * Handle the "start in power-off" case.
-        */
-       if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
-               vcpu_power_off(vcpu);
-       else
-               vcpu->arch.power_off = false;
-
-       return 0;
-}
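
Putting kvm_arch_vcpu_ioctl_vcpu_init() in context: userspace typically feeds
it the target returned by KVM_ARM_PREFERRED_TARGET, which is handled further
down in kvm_arch_vm_ioctl(). A hedged sketch of that sequence, with init_vcpu()
as a made-up helper and error handling kept minimal:

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int init_vcpu(int vm_fd, int vcpu_fd)
{
        struct kvm_vcpu_init init;

        /* Ask the VM which target/features make sense on this host. */
        if (ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &init))
                return -1;

        /* Optional features could be set here, e.g.
         * init.features[0] |= 1u << KVM_ARM_VCPU_PSCI_0_2; */

        return ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &init);
}
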
-
-static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu,
-                                struct kvm_device_attr *attr)
-{
-       int ret = -ENXIO;
-
-       switch (attr->group) {
-       default:
-               ret = kvm_arm_vcpu_arch_set_attr(vcpu, attr);
-               break;
-       }
-
-       return ret;
-}
-
-static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu,
-                                struct kvm_device_attr *attr)
-{
-       int ret = -ENXIO;
-
-       switch (attr->group) {
-       default:
-               ret = kvm_arm_vcpu_arch_get_attr(vcpu, attr);
-               break;
-       }
-
-       return ret;
-}
-
-static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu,
-                                struct kvm_device_attr *attr)
-{
-       int ret = -ENXIO;
-
-       switch (attr->group) {
-       default:
-               ret = kvm_arm_vcpu_arch_has_attr(vcpu, attr);
-               break;
-       }
-
-       return ret;
-}
-
-static int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
-                                  struct kvm_vcpu_events *events)
-{
-       memset(events, 0, sizeof(*events));
-
-       return __kvm_arm_vcpu_get_events(vcpu, events);
-}
-
-static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
-                                  struct kvm_vcpu_events *events)
-{
-       int i;
-
-       /* check whether the reserved field is zero */
-       for (i = 0; i < ARRAY_SIZE(events->reserved); i++)
-               if (events->reserved[i])
-                       return -EINVAL;
-
-       /* check whether the pad field is zero */
-       for (i = 0; i < ARRAY_SIZE(events->exception.pad); i++)
-               if (events->exception.pad[i])
-                       return -EINVAL;
-
-       return __kvm_arm_vcpu_set_events(vcpu, events);
-}
-
-long kvm_arch_vcpu_ioctl(struct file *filp,
-                        unsigned int ioctl, unsigned long arg)
-{
-       struct kvm_vcpu *vcpu = filp->private_data;
-       void __user *argp = (void __user *)arg;
-       struct kvm_device_attr attr;
-       long r;
-
-       switch (ioctl) {
-       case KVM_ARM_VCPU_INIT: {
-               struct kvm_vcpu_init init;
-
-               r = -EFAULT;
-               if (copy_from_user(&init, argp, sizeof(init)))
-                       break;
-
-               r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init);
-               break;
-       }
-       case KVM_SET_ONE_REG:
-       case KVM_GET_ONE_REG: {
-               struct kvm_one_reg reg;
-
-               r = -ENOEXEC;
-               if (unlikely(!kvm_vcpu_initialized(vcpu)))
-                       break;
-
-               r = -EFAULT;
-               if (copy_from_user(&reg, argp, sizeof(reg)))
-                       break;
-
-               if (ioctl == KVM_SET_ONE_REG)
-                       r = kvm_arm_set_reg(vcpu, &reg);
-               else
-                       r = kvm_arm_get_reg(vcpu, &reg);
-               break;
-       }
-       case KVM_GET_REG_LIST: {
-               struct kvm_reg_list __user *user_list = argp;
-               struct kvm_reg_list reg_list;
-               unsigned n;
-
-               r = -ENOEXEC;
-               if (unlikely(!kvm_vcpu_initialized(vcpu)))
-                       break;
-
-               r = -EPERM;
-               if (!kvm_arm_vcpu_is_finalized(vcpu))
-                       break;
-
-               r = -EFAULT;
-               if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
-                       break;
-               n = reg_list.n;
-               reg_list.n = kvm_arm_num_regs(vcpu);
-               if (copy_to_user(user_list, &reg_list, sizeof(reg_list)))
-                       break;
-               r = -E2BIG;
-               if (n < reg_list.n)
-                       break;
-               r = kvm_arm_copy_reg_indices(vcpu, user_list->reg);
-               break;
-       }
-       case KVM_SET_DEVICE_ATTR: {
-               r = -EFAULT;
-               if (copy_from_user(&attr, argp, sizeof(attr)))
-                       break;
-               r = kvm_arm_vcpu_set_attr(vcpu, &attr);
-               break;
-       }
-       case KVM_GET_DEVICE_ATTR: {
-               r = -EFAULT;
-               if (copy_from_user(&attr, argp, sizeof(attr)))
-                       break;
-               r = kvm_arm_vcpu_get_attr(vcpu, &attr);
-               break;
-       }
-       case KVM_HAS_DEVICE_ATTR: {
-               r = -EFAULT;
-               if (copy_from_user(&attr, argp, sizeof(attr)))
-                       break;
-               r = kvm_arm_vcpu_has_attr(vcpu, &attr);
-               break;
-       }
-       case KVM_GET_VCPU_EVENTS: {
-               struct kvm_vcpu_events events;
-
-               if (kvm_arm_vcpu_get_events(vcpu, &events))
-                       return -EINVAL;
-
-               if (copy_to_user(argp, &events, sizeof(events)))
-                       return -EFAULT;
-
-               return 0;
-       }
-       case KVM_SET_VCPU_EVENTS: {
-               struct kvm_vcpu_events events;
-
-               if (copy_from_user(&events, argp, sizeof(events)))
-                       return -EFAULT;
-
-               return kvm_arm_vcpu_set_events(vcpu, &events);
-       }
-       case KVM_ARM_VCPU_FINALIZE: {
-               int what;
-
-               if (!kvm_vcpu_initialized(vcpu))
-                       return -ENOEXEC;
-
-               if (get_user(what, (const int __user *)argp))
-                       return -EFAULT;
-
-               return kvm_arm_vcpu_finalize(vcpu, what);
-       }
-       default:
-               r = -EINVAL;
-       }
-
-       return r;
-}
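
The KVM_GET_REG_LIST branch above implements the usual two-call protocol: a
first call with too small an n fails with E2BIG but still reports the real
count, so userspace can retry with a big enough buffer. A minimal sketch of
that pattern (get_reg_list() is a hypothetical helper):

#include <errno.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static struct kvm_reg_list *get_reg_list(int vcpu_fd)
{
        struct kvm_reg_list probe = { .n = 0 };
        struct kvm_reg_list *list;

        /* First call: learn how many registers there are. */
        if (ioctl(vcpu_fd, KVM_GET_REG_LIST, &probe) == 0 || errno != E2BIG)
                return NULL;

        list = calloc(1, sizeof(*list) + probe.n * sizeof(__u64));
        if (!list)
                return NULL;

        /* Second call: fetch the register IDs themselves. */
        list->n = probe.n;
        if (ioctl(vcpu_fd, KVM_GET_REG_LIST, list)) {
                free(list);
                return NULL;
        }
        return list;    /* list->reg[0..list->n - 1] holds the IDs */
}
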
-
-void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
-{
-
-}
-
-void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
-                                       struct kvm_memory_slot *memslot)
-{
-       kvm_flush_remote_tlbs(kvm);
-}
-
-static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
-                                       struct kvm_arm_device_addr *dev_addr)
-{
-       unsigned long dev_id, type;
-
-       dev_id = (dev_addr->id & KVM_ARM_DEVICE_ID_MASK) >>
-               KVM_ARM_DEVICE_ID_SHIFT;
-       type = (dev_addr->id & KVM_ARM_DEVICE_TYPE_MASK) >>
-               KVM_ARM_DEVICE_TYPE_SHIFT;
-
-       switch (dev_id) {
-       case KVM_ARM_DEVICE_VGIC_V2:
-               if (!vgic_present)
-                       return -ENXIO;
-               return kvm_vgic_addr(kvm, type, &dev_addr->addr, true);
-       default:
-               return -ENODEV;
-       }
-}
-
-long kvm_arch_vm_ioctl(struct file *filp,
-                      unsigned int ioctl, unsigned long arg)
-{
-       struct kvm *kvm = filp->private_data;
-       void __user *argp = (void __user *)arg;
-
-       switch (ioctl) {
-       case KVM_CREATE_IRQCHIP: {
-               int ret;
-               if (!vgic_present)
-                       return -ENXIO;
-               mutex_lock(&kvm->lock);
-               ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
-               mutex_unlock(&kvm->lock);
-               return ret;
-       }
-       case KVM_ARM_SET_DEVICE_ADDR: {
-               struct kvm_arm_device_addr dev_addr;
-
-               if (copy_from_user(&dev_addr, argp, sizeof(dev_addr)))
-                       return -EFAULT;
-               return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr);
-       }
-       case KVM_ARM_PREFERRED_TARGET: {
-               int err;
-               struct kvm_vcpu_init init;
-
-               err = kvm_vcpu_preferred_target(&init);
-               if (err)
-                       return err;
-
-               if (copy_to_user(argp, &init, sizeof(init)))
-                       return -EFAULT;
-
-               return 0;
-       }
-       default:
-               return -EINVAL;
-       }
-}
-
-static void cpu_init_hyp_mode(void)
-{
-       phys_addr_t pgd_ptr;
-       unsigned long hyp_stack_ptr;
-       unsigned long stack_page;
-       unsigned long vector_ptr;
-
-       /* Switch from the HYP stub to our own HYP init vector */
-       __hyp_set_vectors(kvm_get_idmap_vector());
-
-       pgd_ptr = kvm_mmu_get_httbr();
-       stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
-       hyp_stack_ptr = stack_page + PAGE_SIZE;
-       vector_ptr = (unsigned long)kvm_get_hyp_vector();
-
-       __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr);
-       __cpu_init_stage2();
-}
-
-static void cpu_hyp_reset(void)
-{
-       if (!is_kernel_in_hyp_mode())
-               __hyp_reset_vectors();
-}
-
-static void cpu_hyp_reinit(void)
-{
-       kvm_init_host_cpu_context(&this_cpu_ptr(&kvm_host_data)->host_ctxt);
-
-       cpu_hyp_reset();
-
-       if (is_kernel_in_hyp_mode())
-               kvm_timer_init_vhe();
-       else
-               cpu_init_hyp_mode();
-
-       kvm_arm_init_debug();
-
-       if (vgic_present)
-               kvm_vgic_init_cpu_hardware();
-}
-
-static void _kvm_arch_hardware_enable(void *discard)
-{
-       if (!__this_cpu_read(kvm_arm_hardware_enabled)) {
-               cpu_hyp_reinit();
-               __this_cpu_write(kvm_arm_hardware_enabled, 1);
-       }
-}
-
-int kvm_arch_hardware_enable(void)
-{
-       _kvm_arch_hardware_enable(NULL);
-       return 0;
-}
-
-static void _kvm_arch_hardware_disable(void *discard)
-{
-       if (__this_cpu_read(kvm_arm_hardware_enabled)) {
-               cpu_hyp_reset();
-               __this_cpu_write(kvm_arm_hardware_enabled, 0);
-       }
-}
-
-void kvm_arch_hardware_disable(void)
-{
-       _kvm_arch_hardware_disable(NULL);
-}
-
-#ifdef CONFIG_CPU_PM
-static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
-                                   unsigned long cmd,
-                                   void *v)
-{
-       /*
-        * kvm_arm_hardware_enabled is left with its old value over
-        * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should
-        * re-enable hyp.
-        */
-       switch (cmd) {
-       case CPU_PM_ENTER:
-               if (__this_cpu_read(kvm_arm_hardware_enabled))
-                       /*
-                        * don't update kvm_arm_hardware_enabled here
-                        * so that the hardware will be re-enabled
-                        * when we resume. See below.
-                        */
-                       cpu_hyp_reset();
-
-               return NOTIFY_OK;
-       case CPU_PM_ENTER_FAILED:
-       case CPU_PM_EXIT:
-               if (__this_cpu_read(kvm_arm_hardware_enabled))
-                       /* The hardware was enabled before suspend. */
-                       cpu_hyp_reinit();
-
-               return NOTIFY_OK;
-
-       default:
-               return NOTIFY_DONE;
-       }
-}
-
-static struct notifier_block hyp_init_cpu_pm_nb = {
-       .notifier_call = hyp_init_cpu_pm_notifier,
-};
-
-static void __init hyp_cpu_pm_init(void)
-{
-       cpu_pm_register_notifier(&hyp_init_cpu_pm_nb);
-}
-static void __init hyp_cpu_pm_exit(void)
-{
-       cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb);
-}
-#else
-static inline void hyp_cpu_pm_init(void)
-{
-}
-static inline void hyp_cpu_pm_exit(void)
-{
-}
-#endif
-
-static int init_common_resources(void)
-{
-       kvm_set_ipa_limit();
-
-       return 0;
-}
-
-static int init_subsystems(void)
-{
-       int err = 0;
-
-       /*
-        * Enable hardware so that subsystem initialisation can access EL2.
-        */
-       on_each_cpu(_kvm_arch_hardware_enable, NULL, 1);
-
-       /*
-        * Register CPU low-power notifier
-        */
-       hyp_cpu_pm_init();
-
-       /*
-        * Init HYP view of VGIC
-        */
-       err = kvm_vgic_hyp_init();
-       switch (err) {
-       case 0:
-               vgic_present = true;
-               break;
-       case -ENODEV:
-       case -ENXIO:
-               vgic_present = false;
-               err = 0;
-               break;
-       default:
-               goto out;
-       }
-
-       /*
-        * Init HYP architected timer support
-        */
-       err = kvm_timer_hyp_init(vgic_present);
-       if (err)
-               goto out;
-
-       kvm_perf_init();
-       kvm_coproc_table_init();
-
-out:
-       on_each_cpu(_kvm_arch_hardware_disable, NULL, 1);
-
-       return err;
-}
-
-static void teardown_hyp_mode(void)
-{
-       int cpu;
-
-       free_hyp_pgds();
-       for_each_possible_cpu(cpu)
-               free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
-}
-
-/**
- * Initialize Hyp-mode on all online CPUs
- */
-static int init_hyp_mode(void)
-{
-       int cpu;
-       int err = 0;
-
-       /*
-        * Allocate Hyp PGD and setup Hyp identity mapping
-        */
-       err = kvm_mmu_init();
-       if (err)
-               goto out_err;
-
-       /*
-        * Allocate stack pages for Hypervisor-mode
-        */
-       for_each_possible_cpu(cpu) {
-               unsigned long stack_page;
-
-               stack_page = __get_free_page(GFP_KERNEL);
-               if (!stack_page) {
-                       err = -ENOMEM;
-                       goto out_err;
-               }
-
-               per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
-       }
-
-       /*
-        * Map the Hyp-code called directly from the host
-        */
-       err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
-                                 kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC);
-       if (err) {
-               kvm_err("Cannot map world-switch code\n");
-               goto out_err;
-       }
-
-       err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
-                                 kvm_ksym_ref(__end_rodata), PAGE_HYP_RO);
-       if (err) {
-               kvm_err("Cannot map rodata section\n");
-               goto out_err;
-       }
-
-       err = create_hyp_mappings(kvm_ksym_ref(__bss_start),
-                                 kvm_ksym_ref(__bss_stop), PAGE_HYP_RO);
-       if (err) {
-               kvm_err("Cannot map bss section\n");
-               goto out_err;
-       }
-
-       err = kvm_map_vectors();
-       if (err) {
-               kvm_err("Cannot map vectors\n");
-               goto out_err;
-       }
-
-       /*
-        * Map the Hyp stack pages
-        */
-       for_each_possible_cpu(cpu) {
-               char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
-               err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE,
-                                         PAGE_HYP);
-
-               if (err) {
-                       kvm_err("Cannot map hyp stack\n");
-                       goto out_err;
-               }
-       }
-
-       for_each_possible_cpu(cpu) {
-               kvm_host_data_t *cpu_data;
-
-               cpu_data = per_cpu_ptr(&kvm_host_data, cpu);
-               err = create_hyp_mappings(cpu_data, cpu_data + 1, PAGE_HYP);
-
-               if (err) {
-                       kvm_err("Cannot map host CPU state: %d\n", err);
-                       goto out_err;
-               }
-       }
-
-       err = hyp_map_aux_data();
-       if (err)
-               kvm_err("Cannot map host auxiliary data: %d\n", err);
-
-       return 0;
-
-out_err:
-       teardown_hyp_mode();
-       kvm_err("error initializing Hyp mode: %d\n", err);
-       return err;
-}
-
-static void check_kvm_target_cpu(void *ret)
-{
-       *(int *)ret = kvm_target_cpu();
-}
-
-struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
-{
-       struct kvm_vcpu *vcpu;
-       int i;
-
-       mpidr &= MPIDR_HWID_BITMASK;
-       kvm_for_each_vcpu(i, vcpu, kvm) {
-               if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu))
-                       return vcpu;
-       }
-       return NULL;
-}
-
-bool kvm_arch_has_irq_bypass(void)
-{
-       return true;
-}
-
-int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
-                                     struct irq_bypass_producer *prod)
-{
-       struct kvm_kernel_irqfd *irqfd =
-               container_of(cons, struct kvm_kernel_irqfd, consumer);
-
-       return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq,
-                                         &irqfd->irq_entry);
-}
-void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
-                                     struct irq_bypass_producer *prod)
-{
-       struct kvm_kernel_irqfd *irqfd =
-               container_of(cons, struct kvm_kernel_irqfd, consumer);
-
-       kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq,
-                                    &irqfd->irq_entry);
-}
-
-void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
-{
-       struct kvm_kernel_irqfd *irqfd =
-               container_of(cons, struct kvm_kernel_irqfd, consumer);
-
-       kvm_arm_halt_guest(irqfd->kvm);
-}
-
-void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons)
-{
-       struct kvm_kernel_irqfd *irqfd =
-               container_of(cons, struct kvm_kernel_irqfd, consumer);
-
-       kvm_arm_resume_guest(irqfd->kvm);
-}
-
-/**
- * Initialize Hyp-mode and memory mappings on all CPUs.
- */
-int kvm_arch_init(void *opaque)
-{
-       int err;
-       int ret, cpu;
-       bool in_hyp_mode;
-
-       if (!is_hyp_mode_available()) {
-               kvm_info("HYP mode not available\n");
-               return -ENODEV;
-       }
-
-       in_hyp_mode = is_kernel_in_hyp_mode();
-
-       if (!in_hyp_mode && kvm_arch_requires_vhe()) {
-               kvm_pr_unimpl("CPU unsupported in non-VHE mode, not initializing\n");
-               return -ENODEV;
-       }
-
-       for_each_online_cpu(cpu) {
-               smp_call_function_single(cpu, check_kvm_target_cpu, &ret, 1);
-               if (ret < 0) {
-                       kvm_err("Error, CPU %d not supported!\n", cpu);
-                       return -ENODEV;
-               }
-       }
-
-       err = init_common_resources();
-       if (err)
-               return err;
-
-       err = kvm_arm_init_sve();
-       if (err)
-               return err;
-
-       if (!in_hyp_mode) {
-               err = init_hyp_mode();
-               if (err)
-                       goto out_err;
-       }
-
-       err = init_subsystems();
-       if (err)
-               goto out_hyp;
-
-       if (in_hyp_mode)
-               kvm_info("VHE mode initialized successfully\n");
-       else
-               kvm_info("Hyp mode initialized successfully\n");
-
-       return 0;
-
-out_hyp:
-       hyp_cpu_pm_exit();
-       if (!in_hyp_mode)
-               teardown_hyp_mode();
-out_err:
-       return err;
-}
-
-/* NOP: Compiling as a module not supported */
-void kvm_arch_exit(void)
-{
-       kvm_perf_teardown();
-}
-
-static int arm_init(void)
-{
-       int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
-       return rc;
-}
-
-module_init(arm_init);
diff --git a/virt/kvm/arm/hyp/aarch32.c b/virt/kvm/arm/hyp/aarch32.c
deleted file mode 100644 (file)
index 25c0e47..0000000
+++ /dev/null
@@ -1,140 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Hyp portion of the (not much of an) Emulation layer for 32bit guests.
- *
- * Copyright (C) 2012,2013 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * based on arch/arm/kvm/emulate.c
- * Copyright (C) 2012 - Virtual Open Systems and Columbia University
- * Author: Christoffer Dall <c.dall@virtualopensystems.com>
- */
-
-#include <linux/kvm_host.h>
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_hyp.h>
-
-/*
- * stolen from arch/arm/kernel/opcodes.c
- *
- * condition code lookup table
- * index into the table is test code: EQ, NE, ... LT, GT, AL, NV
- *
- * bit position in short is condition code: NZCV
- */
-static const unsigned short cc_map[16] = {
-       0xF0F0,                 /* EQ == Z set            */
-       0x0F0F,                 /* NE                     */
-       0xCCCC,                 /* CS == C set            */
-       0x3333,                 /* CC                     */
-       0xFF00,                 /* MI == N set            */
-       0x00FF,                 /* PL                     */
-       0xAAAA,                 /* VS == V set            */
-       0x5555,                 /* VC                     */
-       0x0C0C,                 /* HI == C set && Z clear */
-       0xF3F3,                 /* LS == C clear || Z set */
-       0xAA55,                 /* GE == (N==V)           */
-       0x55AA,                 /* LT == (N!=V)           */
-       0x0A05,                 /* GT == (!Z && (N==V))   */
-       0xF5FA,                 /* LE == (Z || (N!=V))    */
-       0xFFFF,                 /* AL always              */
-       0                       /* NV                     */
-};
-
-/*
- * Check if a trapped instruction should have been executed or not.
- */
-bool __hyp_text kvm_condition_valid32(const struct kvm_vcpu *vcpu)
-{
-       unsigned long cpsr;
-       u32 cpsr_cond;
-       int cond;
-
-       /* Top two bits non-zero?  Unconditional. */
-       if (kvm_vcpu_get_hsr(vcpu) >> 30)
-               return true;
-
-       /* Is condition field valid? */
-       cond = kvm_vcpu_get_condition(vcpu);
-       if (cond == 0xE)
-               return true;
-
-       cpsr = *vcpu_cpsr(vcpu);
-
-       if (cond < 0) {
-               /* This can happen in Thumb mode: examine IT state. */
-               unsigned long it;
-
-               it = ((cpsr >> 8) & 0xFC) | ((cpsr >> 25) & 0x3);
-
-               /* it == 0 => unconditional. */
-               if (it == 0)
-                       return true;
-
-               /* The cond for this insn works out as the top 4 bits. */
-               cond = (it >> 4);
-       }
-
-       cpsr_cond = cpsr >> 28;
-
-       if (!((cc_map[cond] >> cpsr_cond) & 1))
-               return false;
-
-       return true;
-}
-
-/**
- * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block
- * @vcpu:      The VCPU pointer
- *
- * When exceptions occur while instructions are executed in Thumb IF-THEN
- * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have
- * to do this little bit of work manually. The fields map like this:
- *
- * IT[7:0] -> CPSR[26:25],CPSR[15:10]
- */
-static void __hyp_text kvm_adjust_itstate(struct kvm_vcpu *vcpu)
-{
-       unsigned long itbits, cond;
-       unsigned long cpsr = *vcpu_cpsr(vcpu);
-       bool is_arm = !(cpsr & PSR_AA32_T_BIT);
-
-       if (is_arm || !(cpsr & PSR_AA32_IT_MASK))
-               return;
-
-       cond = (cpsr & 0xe000) >> 13;
-       itbits = (cpsr & 0x1c00) >> (10 - 2);
-       itbits |= (cpsr & (0x3 << 25)) >> 25;
-
-       /* Perform ITAdvance (see page A2-52 in ARM DDI 0406C) */
-       if ((itbits & 0x7) == 0)
-               itbits = cond = 0;
-       else
-               itbits = (itbits << 1) & 0x1f;
-
-       cpsr &= ~PSR_AA32_IT_MASK;
-       cpsr |= cond << 13;
-       cpsr |= (itbits & 0x1c) << (10 - 2);
-       cpsr |= (itbits & 0x3) << 25;
-       *vcpu_cpsr(vcpu) = cpsr;
-}
-
-/**
- * kvm_skip_instr - skip a trapped instruction and proceed to the next
- * @vcpu: The vcpu pointer
- */
-void __hyp_text kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr)
-{
-       u32 pc = *vcpu_pc(vcpu);
-       bool is_thumb;
-
-       is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_AA32_T_BIT);
-       if (is_thumb && !is_wide_instr)
-               pc += 2;
-       else
-               pc += 4;
-
-       *vcpu_pc(vcpu) = pc;
-
-       kvm_adjust_itstate(vcpu);
-}
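
The cc_map table at the top of this file can be exercised in isolation. The
standalone sketch below (names are illustrative, not kernel code) reproduces
the lookup performed by kvm_condition_valid32(): the NZCV nibble taken from
CPSR[31:28] selects one bit of the 16-bit mask associated with the condition
code.

#include <stdbool.h>
#include <stdint.h>

static const uint16_t cc_map_example[16] = {
        0xF0F0, 0x0F0F, 0xCCCC, 0x3333, 0xFF00, 0x00FF, 0xAAAA, 0x5555,
        0x0C0C, 0xF3F3, 0xAA55, 0x55AA, 0x0A05, 0xF5FA, 0xFFFF, 0x0000,
};

static bool cond_passes(uint32_t cpsr, unsigned int cond)
{
        uint32_t nzcv = cpsr >> 28;     /* N = bit 3, Z = bit 2, C = bit 1, V = bit 0 */

        return (cc_map_example[cond & 0xf] >> nzcv) & 1;
}

/* With only Z set (cpsr = 0x40000000, nzcv = 0x4):
 *      cond_passes(0x40000000, 0) is true  (EQ),
 *      cond_passes(0x40000000, 1) is false (NE).
 */
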
diff --git a/virt/kvm/arm/hyp/timer-sr.c b/virt/kvm/arm/hyp/timer-sr.c
deleted file mode 100644 (file)
index ff76e68..0000000
+++ /dev/null
@@ -1,49 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2012-2015 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- */
-
-#include <clocksource/arm_arch_timer.h>
-#include <linux/compiler.h>
-#include <linux/kvm_host.h>
-
-#include <asm/kvm_hyp.h>
-
-void __hyp_text __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high)
-{
-       u64 cntvoff = (u64)cntvoff_high << 32 | cntvoff_low;
-       write_sysreg(cntvoff, cntvoff_el2);
-}
-
-/*
- * Should only be called on non-VHE systems.
- * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe().
- */
-void __hyp_text __timer_disable_traps(struct kvm_vcpu *vcpu)
-{
-       u64 val;
-
-       /* Allow physical timer/counter access for the host */
-       val = read_sysreg(cnthctl_el2);
-       val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN;
-       write_sysreg(val, cnthctl_el2);
-}
-
-/*
- * Should only be called on non-VHE systems.
- * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe().
- */
-void __hyp_text __timer_enable_traps(struct kvm_vcpu *vcpu)
-{
-       u64 val;
-
-       /*
-        * Disallow physical timer access for the guest
-        * Physical counter access is allowed
-        */
-       val = read_sysreg(cnthctl_el2);
-       val &= ~CNTHCTL_EL1PCEN;
-       val |= CNTHCTL_EL1PCTEN;
-       write_sysreg(val, cnthctl_el2);
-}
diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c
deleted file mode 100644 (file)
index ccf1fde..0000000
+++ /dev/null
@@ -1,1130 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2012-2015 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- */
-
-#include <linux/compiler.h>
-#include <linux/irqchip/arm-gic-v3.h>
-#include <linux/kvm_host.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_hyp.h>
-#include <asm/kvm_mmu.h>
-
-#define vtr_to_max_lr_idx(v)           ((v) & 0xf)
-#define vtr_to_nr_pre_bits(v)          ((((u32)(v) >> 26) & 7) + 1)
-#define vtr_to_nr_apr_regs(v)          (1 << (vtr_to_nr_pre_bits(v) - 5))
-
-static u64 __hyp_text __gic_v3_get_lr(unsigned int lr)
-{
-       switch (lr & 0xf) {
-       case 0:
-               return read_gicreg(ICH_LR0_EL2);
-       case 1:
-               return read_gicreg(ICH_LR1_EL2);
-       case 2:
-               return read_gicreg(ICH_LR2_EL2);
-       case 3:
-               return read_gicreg(ICH_LR3_EL2);
-       case 4:
-               return read_gicreg(ICH_LR4_EL2);
-       case 5:
-               return read_gicreg(ICH_LR5_EL2);
-       case 6:
-               return read_gicreg(ICH_LR6_EL2);
-       case 7:
-               return read_gicreg(ICH_LR7_EL2);
-       case 8:
-               return read_gicreg(ICH_LR8_EL2);
-       case 9:
-               return read_gicreg(ICH_LR9_EL2);
-       case 10:
-               return read_gicreg(ICH_LR10_EL2);
-       case 11:
-               return read_gicreg(ICH_LR11_EL2);
-       case 12:
-               return read_gicreg(ICH_LR12_EL2);
-       case 13:
-               return read_gicreg(ICH_LR13_EL2);
-       case 14:
-               return read_gicreg(ICH_LR14_EL2);
-       case 15:
-               return read_gicreg(ICH_LR15_EL2);
-       }
-
-       unreachable();
-}
-
-static void __hyp_text __gic_v3_set_lr(u64 val, int lr)
-{
-       switch (lr & 0xf) {
-       case 0:
-               write_gicreg(val, ICH_LR0_EL2);
-               break;
-       case 1:
-               write_gicreg(val, ICH_LR1_EL2);
-               break;
-       case 2:
-               write_gicreg(val, ICH_LR2_EL2);
-               break;
-       case 3:
-               write_gicreg(val, ICH_LR3_EL2);
-               break;
-       case 4:
-               write_gicreg(val, ICH_LR4_EL2);
-               break;
-       case 5:
-               write_gicreg(val, ICH_LR5_EL2);
-               break;
-       case 6:
-               write_gicreg(val, ICH_LR6_EL2);
-               break;
-       case 7:
-               write_gicreg(val, ICH_LR7_EL2);
-               break;
-       case 8:
-               write_gicreg(val, ICH_LR8_EL2);
-               break;
-       case 9:
-               write_gicreg(val, ICH_LR9_EL2);
-               break;
-       case 10:
-               write_gicreg(val, ICH_LR10_EL2);
-               break;
-       case 11:
-               write_gicreg(val, ICH_LR11_EL2);
-               break;
-       case 12:
-               write_gicreg(val, ICH_LR12_EL2);
-               break;
-       case 13:
-               write_gicreg(val, ICH_LR13_EL2);
-               break;
-       case 14:
-               write_gicreg(val, ICH_LR14_EL2);
-               break;
-       case 15:
-               write_gicreg(val, ICH_LR15_EL2);
-               break;
-       }
-}
-
-static void __hyp_text __vgic_v3_write_ap0rn(u32 val, int n)
-{
-       switch (n) {
-       case 0:
-               write_gicreg(val, ICH_AP0R0_EL2);
-               break;
-       case 1:
-               write_gicreg(val, ICH_AP0R1_EL2);
-               break;
-       case 2:
-               write_gicreg(val, ICH_AP0R2_EL2);
-               break;
-       case 3:
-               write_gicreg(val, ICH_AP0R3_EL2);
-               break;
-       }
-}
-
-static void __hyp_text __vgic_v3_write_ap1rn(u32 val, int n)
-{
-       switch (n) {
-       case 0:
-               write_gicreg(val, ICH_AP1R0_EL2);
-               break;
-       case 1:
-               write_gicreg(val, ICH_AP1R1_EL2);
-               break;
-       case 2:
-               write_gicreg(val, ICH_AP1R2_EL2);
-               break;
-       case 3:
-               write_gicreg(val, ICH_AP1R3_EL2);
-               break;
-       }
-}
-
-static u32 __hyp_text __vgic_v3_read_ap0rn(int n)
-{
-       u32 val;
-
-       switch (n) {
-       case 0:
-               val = read_gicreg(ICH_AP0R0_EL2);
-               break;
-       case 1:
-               val = read_gicreg(ICH_AP0R1_EL2);
-               break;
-       case 2:
-               val = read_gicreg(ICH_AP0R2_EL2);
-               break;
-       case 3:
-               val = read_gicreg(ICH_AP0R3_EL2);
-               break;
-       default:
-               unreachable();
-       }
-
-       return val;
-}
-
-static u32 __hyp_text __vgic_v3_read_ap1rn(int n)
-{
-       u32 val;
-
-       switch (n) {
-       case 0:
-               val = read_gicreg(ICH_AP1R0_EL2);
-               break;
-       case 1:
-               val = read_gicreg(ICH_AP1R1_EL2);
-               break;
-       case 2:
-               val = read_gicreg(ICH_AP1R2_EL2);
-               break;
-       case 3:
-               val = read_gicreg(ICH_AP1R3_EL2);
-               break;
-       default:
-               unreachable();
-       }
-
-       return val;
-}
-
-void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
-{
-       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-       u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
-
-       /*
-        * Make sure stores to the GIC via the memory mapped interface
-        * are now visible to the system register interface when reading the
-        * LRs, and when reading back the VMCR on non-VHE systems.
-        */
-       if (used_lrs || !has_vhe()) {
-               if (!cpu_if->vgic_sre) {
-                       dsb(sy);
-                       isb();
-               }
-       }
-
-       if (used_lrs || cpu_if->its_vpe.its_vm) {
-               int i;
-               u32 elrsr;
-
-               elrsr = read_gicreg(ICH_ELRSR_EL2);
-
-               write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2);
-
-               for (i = 0; i < used_lrs; i++) {
-                       if (elrsr & (1 << i))
-                               cpu_if->vgic_lr[i] &= ~ICH_LR_STATE;
-                       else
-                               cpu_if->vgic_lr[i] = __gic_v3_get_lr(i);
-
-                       __gic_v3_set_lr(0, i);
-               }
-       }
-}
-
-void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
-{
-       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-       u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
-       int i;
-
-       if (used_lrs || cpu_if->its_vpe.its_vm) {
-               write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
-
-               for (i = 0; i < used_lrs; i++)
-                       __gic_v3_set_lr(cpu_if->vgic_lr[i], i);
-       }
-
-       /*
-        * Ensure that writes to the LRs, and on non-VHE systems ensure that
-        * the write to the VMCR in __vgic_v3_activate_traps(), will have
-        * reached the (re)distributors. This ensures the guest will read the
-        * correct values from the memory-mapped interface.
-        */
-       if (used_lrs || !has_vhe()) {
-               if (!cpu_if->vgic_sre) {
-                       isb();
-                       dsb(sy);
-               }
-       }
-}
-
-void __hyp_text __vgic_v3_activate_traps(struct kvm_vcpu *vcpu)
-{
-       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-
-       /*
-        * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a
-        * Group0 interrupt (as generated in GICv2 mode) to be
-        * delivered as a FIQ to the guest, with potentially fatal
-        * consequences. So we must make sure that ICC_SRE_EL1 has
-        * been actually programmed with the value we want before
-        * starting to mess with the rest of the GIC, and VMCR_EL2 in
-        * particular.  This logic must be called before
-        * __vgic_v3_restore_state().
-        */
-       if (!cpu_if->vgic_sre) {
-               write_gicreg(0, ICC_SRE_EL1);
-               isb();
-               write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2);
-
-
-               if (has_vhe()) {
-                       /*
-                        * Ensure that the write to the VMCR will have reached
-                        * the (re)distributors. This ensures the guest will
-                        * read the correct values from the memory-mapped
-                        * interface.
-                        */
-                       isb();
-                       dsb(sy);
-               }
-       }
-
-       /*
-        * Prevent the guest from touching the GIC system registers if
-        * SRE isn't enabled for GICv3 emulation.
-        */
-       write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
-                    ICC_SRE_EL2);
-
-       /*
-        * If we need to trap system registers, we must write
-        * ICH_HCR_EL2 anyway, even if no interrupts are being
-        * injected.
-        */
-       if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
-           cpu_if->its_vpe.its_vm)
-               write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
-}
-
-void __hyp_text __vgic_v3_deactivate_traps(struct kvm_vcpu *vcpu)
-{
-       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-       u64 val;
-
-       if (!cpu_if->vgic_sre) {
-               cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2);
-       }
-
-       val = read_gicreg(ICC_SRE_EL2);
-       write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2);
-
-       if (!cpu_if->vgic_sre) {
-               /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */
-               isb();
-               write_gicreg(1, ICC_SRE_EL1);
-       }
-
-       /*
-        * If we were trapping system registers, we enabled the VGIC even if
-        * no interrupts were being injected, and we disable it again here.
-        */
-       if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
-           cpu_if->its_vpe.its_vm)
-               write_gicreg(0, ICH_HCR_EL2);
-}
-
-void __hyp_text __vgic_v3_save_aprs(struct kvm_vcpu *vcpu)
-{
-       struct vgic_v3_cpu_if *cpu_if;
-       u64 val;
-       u32 nr_pre_bits;
-
-       vcpu = kern_hyp_va(vcpu);
-       cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-
-       val = read_gicreg(ICH_VTR_EL2);
-       nr_pre_bits = vtr_to_nr_pre_bits(val);
-
-       switch (nr_pre_bits) {
-       case 7:
-               cpu_if->vgic_ap0r[3] = __vgic_v3_read_ap0rn(3);
-               cpu_if->vgic_ap0r[2] = __vgic_v3_read_ap0rn(2);
-               /* Fall through */
-       case 6:
-               cpu_if->vgic_ap0r[1] = __vgic_v3_read_ap0rn(1);
-               /* Fall through */
-       default:
-               cpu_if->vgic_ap0r[0] = __vgic_v3_read_ap0rn(0);
-       }
-
-       switch (nr_pre_bits) {
-       case 7:
-               cpu_if->vgic_ap1r[3] = __vgic_v3_read_ap1rn(3);
-               cpu_if->vgic_ap1r[2] = __vgic_v3_read_ap1rn(2);
-               /* Fall through */
-       case 6:
-               cpu_if->vgic_ap1r[1] = __vgic_v3_read_ap1rn(1);
-               /* Fall through */
-       default:
-               cpu_if->vgic_ap1r[0] = __vgic_v3_read_ap1rn(0);
-       }
-}
-
-void __hyp_text __vgic_v3_restore_aprs(struct kvm_vcpu *vcpu)
-{
-       struct vgic_v3_cpu_if *cpu_if;
-       u64 val;
-       u32 nr_pre_bits;
-
-       vcpu = kern_hyp_va(vcpu);
-       cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-
-       val = read_gicreg(ICH_VTR_EL2);
-       nr_pre_bits = vtr_to_nr_pre_bits(val);
-
-       switch (nr_pre_bits) {
-       case 7:
-               __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[3], 3);
-               __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[2], 2);
-               /* Fall through */
-       case 6:
-               __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[1], 1);
-               /* Fall through */
-       default:
-               __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[0], 0);
-       }
-
-       switch (nr_pre_bits) {
-       case 7:
-               __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[3], 3);
-               __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[2], 2);
-               /* Fall through */
-       case 6:
-               __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[1], 1);
-               /* Fall through */
-       default:
-               __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[0], 0);
-       }
-}
-
-void __hyp_text __vgic_v3_init_lrs(void)
-{
-       int max_lr_idx = vtr_to_max_lr_idx(read_gicreg(ICH_VTR_EL2));
-       int i;
-
-       for (i = 0; i <= max_lr_idx; i++)
-               __gic_v3_set_lr(0, i);
-}
-
-u64 __hyp_text __vgic_v3_get_ich_vtr_el2(void)
-{
-       return read_gicreg(ICH_VTR_EL2);
-}
-
-u64 __hyp_text __vgic_v3_read_vmcr(void)
-{
-       return read_gicreg(ICH_VMCR_EL2);
-}
-
-void __hyp_text __vgic_v3_write_vmcr(u32 vmcr)
-{
-       write_gicreg(vmcr, ICH_VMCR_EL2);
-}
-
-#ifdef CONFIG_ARM64
-
-static int __hyp_text __vgic_v3_bpr_min(void)
-{
-       /* See Pseudocode for VPriorityGroup */
-       return 8 - vtr_to_nr_pre_bits(read_gicreg(ICH_VTR_EL2));
-}
-
-static int __hyp_text __vgic_v3_get_group(struct kvm_vcpu *vcpu)
-{
-       u32 esr = kvm_vcpu_get_hsr(vcpu);
-       u8 crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT;
-
-       return crm != 8;
-}
-
-#define GICv3_IDLE_PRIORITY    0xff
-
-static int __hyp_text __vgic_v3_highest_priority_lr(struct kvm_vcpu *vcpu,
-                                                   u32 vmcr,
-                                                   u64 *lr_val)
-{
-       unsigned int used_lrs = vcpu->arch.vgic_cpu.used_lrs;
-       u8 priority = GICv3_IDLE_PRIORITY;
-       int i, lr = -1;
-
-       for (i = 0; i < used_lrs; i++) {
-               u64 val = __gic_v3_get_lr(i);
-               u8 lr_prio = (val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;
-
-               /* Not pending in the state? */
-               if ((val & ICH_LR_STATE) != ICH_LR_PENDING_BIT)
-                       continue;
-
-               /* Group-0 interrupt, but Group-0 disabled? */
-               if (!(val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG0_MASK))
-                       continue;
-
-               /* Group-1 interrupt, but Group-1 disabled? */
-               if ((val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG1_MASK))
-                       continue;
-
-               /* Not the highest priority? */
-               if (lr_prio >= priority)
-                       continue;
-
-               /* This is a candidate */
-               priority = lr_prio;
-               *lr_val = val;
-               lr = i;
-       }
-
-       if (lr == -1)
-               *lr_val = ICC_IAR1_EL1_SPURIOUS;
-
-       return lr;
-}
-
-static int __hyp_text __vgic_v3_find_active_lr(struct kvm_vcpu *vcpu,
-                                              int intid, u64 *lr_val)
-{
-       unsigned int used_lrs = vcpu->arch.vgic_cpu.used_lrs;
-       int i;
-
-       for (i = 0; i < used_lrs; i++) {
-               u64 val = __gic_v3_get_lr(i);
-
-               if ((val & ICH_LR_VIRTUAL_ID_MASK) == intid &&
-                   (val & ICH_LR_ACTIVE_BIT)) {
-                       *lr_val = val;
-                       return i;
-               }
-       }
-
-       *lr_val = ICC_IAR1_EL1_SPURIOUS;
-       return -1;
-}
-
-static int __hyp_text __vgic_v3_get_highest_active_priority(void)
-{
-       u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2));
-       u32 hap = 0;
-       int i;
-
-       for (i = 0; i < nr_apr_regs; i++) {
-               u32 val;
-
-               /*
-                * The ICH_AP0Rn_EL2 and ICH_AP1Rn_EL2 registers
-                * contain the active priority levels for this VCPU
-                * for the maximum number of supported priority
-                * levels, and we return the full priority level only
-                * if the BPR is programmed to its minimum, otherwise
-                * we return a combination of the priority level and
-                * subpriority, as determined by the setting of the
-                * BPR, but without the full subpriority.
-                */
-               val  = __vgic_v3_read_ap0rn(i);
-               val |= __vgic_v3_read_ap1rn(i);
-               if (!val) {
-                       hap += 32;
-                       continue;
-               }
-
-               return (hap + __ffs(val)) << __vgic_v3_bpr_min();
-       }
-
-       return GICv3_IDLE_PRIORITY;
-}
-
-static unsigned int __hyp_text __vgic_v3_get_bpr0(u32 vmcr)
-{
-       return (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT;
-}
-
-static unsigned int __hyp_text __vgic_v3_get_bpr1(u32 vmcr)
-{
-       unsigned int bpr;
-
-       if (vmcr & ICH_VMCR_CBPR_MASK) {
-               bpr = __vgic_v3_get_bpr0(vmcr);
-               if (bpr < 7)
-                       bpr++;
-       } else {
-               bpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;
-       }
-
-       return bpr;
-}
-
-/*
- * Convert a priority to a preemption level, taking the relevant BPR
- * into account by zeroing the sub-priority bits.
- */
-static u8 __hyp_text __vgic_v3_pri_to_pre(u8 pri, u32 vmcr, int grp)
-{
-       unsigned int bpr;
-
-       if (!grp)
-               bpr = __vgic_v3_get_bpr0(vmcr) + 1;
-       else
-               bpr = __vgic_v3_get_bpr1(vmcr);
-
-       return pri & (GENMASK(7, 0) << bpr);
-}
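
As a quick sanity check on the masking above (illustrative values, not kernel
code): with a BPR of 3 the sub-priority lives in bits [2:0], so priority 0xd9
collapses to preemption level 0xd8, while the minimal BPR keeps the priority
intact.

#include <assert.h>
#include <stdint.h>

/* Standalone version of the masking in __vgic_v3_pri_to_pre();
 * GENMASK(7, 0) is simply 0xff here. */
static uint8_t pri_to_pre(uint8_t pri, unsigned int bpr)
{
        return pri & (0xffu << bpr);
}

int main(void)
{
        assert(pri_to_pre(0xd9, 3) == 0xd8);    /* bits [2:0] are sub-priority */
        assert(pri_to_pre(0xd9, 0) == 0xd9);    /* minimal BPR keeps everything */
        return 0;
}
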
-
-/*
- * The priority value is independent of any of the BPR values, so we
- * normalize it using the minimal BPR value. This guarantees that no
- * matter what the guest does with its BPR, we can always set/get the
- * same value of a priority.
- */
-static void __hyp_text __vgic_v3_set_active_priority(u8 pri, u32 vmcr, int grp)
-{
-       u8 pre, ap;
-       u32 val;
-       int apr;
-
-       pre = __vgic_v3_pri_to_pre(pri, vmcr, grp);
-       ap = pre >> __vgic_v3_bpr_min();
-       apr = ap / 32;
-
-       if (!grp) {
-               val = __vgic_v3_read_ap0rn(apr);
-               __vgic_v3_write_ap0rn(val | BIT(ap % 32), apr);
-       } else {
-               val = __vgic_v3_read_ap1rn(apr);
-               __vgic_v3_write_ap1rn(val | BIT(ap % 32), apr);
-       }
-}
-
-static int __hyp_text __vgic_v3_clear_highest_active_priority(void)
-{
-       u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2));
-       u32 hap = 0;
-       int i;
-
-       for (i = 0; i < nr_apr_regs; i++) {
-               u32 ap0, ap1;
-               int c0, c1;
-
-               ap0 = __vgic_v3_read_ap0rn(i);
-               ap1 = __vgic_v3_read_ap1rn(i);
-               if (!ap0 && !ap1) {
-                       hap += 32;
-                       continue;
-               }
-
-               c0 = ap0 ? __ffs(ap0) : 32;
-               c1 = ap1 ? __ffs(ap1) : 32;
-
-               /* Always clear the LSB, which is the highest priority */
-               if (c0 < c1) {
-                       ap0 &= ~BIT(c0);
-                       __vgic_v3_write_ap0rn(ap0, i);
-                       hap += c0;
-               } else {
-                       ap1 &= ~BIT(c1);
-                       __vgic_v3_write_ap1rn(ap1, i);
-                       hap += c1;
-               }
-
-               /* Rescale to 8 bits of priority */
-               return hap << __vgic_v3_bpr_min();
-       }
-
-       return GICv3_IDLE_PRIORITY;
-}
-
-static void __hyp_text __vgic_v3_read_iar(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-       u64 lr_val;
-       u8 lr_prio, pmr;
-       int lr, grp;
-
-       grp = __vgic_v3_get_group(vcpu);
-
-       lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val);
-       if (lr < 0)
-               goto spurious;
-
-       if (grp != !!(lr_val & ICH_LR_GROUP))
-               goto spurious;
-
-       pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
-       lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;
-       if (pmr <= lr_prio)
-               goto spurious;
-
-       if (__vgic_v3_get_highest_active_priority() <= __vgic_v3_pri_to_pre(lr_prio, vmcr, grp))
-               goto spurious;
-
-       lr_val &= ~ICH_LR_STATE;
-       /* No active state for LPIs */
-       if ((lr_val & ICH_LR_VIRTUAL_ID_MASK) <= VGIC_MAX_SPI)
-               lr_val |= ICH_LR_ACTIVE_BIT;
-       __gic_v3_set_lr(lr_val, lr);
-       __vgic_v3_set_active_priority(lr_prio, vmcr, grp);
-       vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK);
-       return;
-
-spurious:
-       vcpu_set_reg(vcpu, rt, ICC_IAR1_EL1_SPURIOUS);
-}
-
-static void __hyp_text __vgic_v3_clear_active_lr(int lr, u64 lr_val)
-{
-       lr_val &= ~ICH_LR_ACTIVE_BIT;
-       if (lr_val & ICH_LR_HW) {
-               u32 pid;
-
-               pid = (lr_val & ICH_LR_PHYS_ID_MASK) >> ICH_LR_PHYS_ID_SHIFT;
-               gic_write_dir(pid);
-       }
-
-       __gic_v3_set_lr(lr_val, lr);
-}
-
-static void __hyp_text __vgic_v3_bump_eoicount(void)
-{
-       u32 hcr;
-
-       hcr = read_gicreg(ICH_HCR_EL2);
-       hcr += 1 << ICH_HCR_EOIcount_SHIFT;
-       write_gicreg(hcr, ICH_HCR_EL2);
-}
-
-static void __hyp_text __vgic_v3_write_dir(struct kvm_vcpu *vcpu,
-                                          u32 vmcr, int rt)
-{
-       u32 vid = vcpu_get_reg(vcpu, rt);
-       u64 lr_val;
-       int lr;
-
-       /* EOImode == 0, nothing to be done here */
-       if (!(vmcr & ICH_VMCR_EOIM_MASK))
-               return;
-
-       /* No deactivate to be performed on an LPI */
-       if (vid >= VGIC_MIN_LPI)
-               return;
-
-       lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val);
-       if (lr == -1) {
-               __vgic_v3_bump_eoicount();
-               return;
-       }
-
-       __vgic_v3_clear_active_lr(lr, lr_val);
-}
-
-static void __hyp_text __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-       u32 vid = vcpu_get_reg(vcpu, rt);
-       u64 lr_val;
-       u8 lr_prio, act_prio;
-       int lr, grp;
-
-       grp = __vgic_v3_get_group(vcpu);
-
-       /* Drop priority in any case */
-       act_prio = __vgic_v3_clear_highest_active_priority();
-
-       /* If EOIing an LPI, no deactivate to be performed */
-       if (vid >= VGIC_MIN_LPI)
-               return;
-
-       /* EOImode == 1, nothing to be done here */
-       if (vmcr & ICH_VMCR_EOIM_MASK)
-               return;
-
-       lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val);
-       if (lr == -1) {
-               __vgic_v3_bump_eoicount();
-               return;
-       }
-
-       lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;
-
-       /* If priorities or group do not match, the guest has fscked-up. */
-       if (grp != !!(lr_val & ICH_LR_GROUP) ||
-           __vgic_v3_pri_to_pre(lr_prio, vmcr, grp) != act_prio)
-               return;
-
-       /* Let's now perform the deactivation */
-       __vgic_v3_clear_active_lr(lr, lr_val);
-}
-
-static void __hyp_text __vgic_v3_read_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-       vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG0_MASK));
-}
-
-static void __hyp_text __vgic_v3_read_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-       vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG1_MASK));
-}
-
-static void __hyp_text __vgic_v3_write_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-       u64 val = vcpu_get_reg(vcpu, rt);
-
-       if (val & 1)
-               vmcr |= ICH_VMCR_ENG0_MASK;
-       else
-               vmcr &= ~ICH_VMCR_ENG0_MASK;
-
-       __vgic_v3_write_vmcr(vmcr);
-}
-
-static void __hyp_text __vgic_v3_write_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-       u64 val = vcpu_get_reg(vcpu, rt);
-
-       if (val & 1)
-               vmcr |= ICH_VMCR_ENG1_MASK;
-       else
-               vmcr &= ~ICH_VMCR_ENG1_MASK;
-
-       __vgic_v3_write_vmcr(vmcr);
-}
-
-static void __hyp_text __vgic_v3_read_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-       vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr0(vmcr));
-}
-
-static void __hyp_text __vgic_v3_read_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-       vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr1(vmcr));
-}
-
-static void __hyp_text __vgic_v3_write_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-       u64 val = vcpu_get_reg(vcpu, rt);
-       u8 bpr_min = __vgic_v3_bpr_min() - 1;
-
-       /* Enforce BPR limiting */
-       if (val < bpr_min)
-               val = bpr_min;
-
-       val <<= ICH_VMCR_BPR0_SHIFT;
-       val &= ICH_VMCR_BPR0_MASK;
-       vmcr &= ~ICH_VMCR_BPR0_MASK;
-       vmcr |= val;
-
-       __vgic_v3_write_vmcr(vmcr);
-}
-
-static void __hyp_text __vgic_v3_write_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-       u64 val = vcpu_get_reg(vcpu, rt);
-       u8 bpr_min = __vgic_v3_bpr_min();
-
-       if (vmcr & ICH_VMCR_CBPR_MASK)
-               return;
-
-       /* Enforce BPR limiting */
-       if (val < bpr_min)
-               val = bpr_min;
-
-       val <<= ICH_VMCR_BPR1_SHIFT;
-       val &= ICH_VMCR_BPR1_MASK;
-       vmcr &= ~ICH_VMCR_BPR1_MASK;
-       vmcr |= val;
-
-       __vgic_v3_write_vmcr(vmcr);
-}
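
Both BPR accessors above clamp the guest-supplied value to the smallest binary point the virtual interface supports before folding it back into ICH_VMCR_EL2. A rough, self-contained illustration of that clamp/shift/mask/merge sequence (the field position and the minimum used here are invented stand-ins, not the real ICH_VMCR_EL2 encoding):

#include <stdint.h>
#include <stdio.h>

#define BPR1_SHIFT 18                        /* invented field position */
#define BPR1_MASK  (0x7u << BPR1_SHIFT)

static uint32_t write_bpr1(uint32_t vmcr, uint32_t val, uint32_t bpr_min)
{
        if (val < bpr_min)                   /* enforce BPR limiting */
                val = bpr_min;
        val = (val << BPR1_SHIFT) & BPR1_MASK;
        return (vmcr & ~BPR1_MASK) | val;    /* replace only the BPR1 field */
}

int main(void)
{
        /* a guest write of 1 with a minimum of 3 is silently raised to 3 */
        printf("vmcr = %#x\n", write_bpr1(0, 1, 3));
        return 0;
}
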
-
-static void __hyp_text __vgic_v3_read_apxrn(struct kvm_vcpu *vcpu, int rt, int n)
-{
-       u32 val;
-
-       if (!__vgic_v3_get_group(vcpu))
-               val = __vgic_v3_read_ap0rn(n);
-       else
-               val = __vgic_v3_read_ap1rn(n);
-
-       vcpu_set_reg(vcpu, rt, val);
-}
-
-static void __hyp_text __vgic_v3_write_apxrn(struct kvm_vcpu *vcpu, int rt, int n)
-{
-       u32 val = vcpu_get_reg(vcpu, rt);
-
-       if (!__vgic_v3_get_group(vcpu))
-               __vgic_v3_write_ap0rn(val, n);
-       else
-               __vgic_v3_write_ap1rn(val, n);
-}
-
-static void __hyp_text __vgic_v3_read_apxr0(struct kvm_vcpu *vcpu,
-                                           u32 vmcr, int rt)
-{
-       __vgic_v3_read_apxrn(vcpu, rt, 0);
-}
-
-static void __hyp_text __vgic_v3_read_apxr1(struct kvm_vcpu *vcpu,
-                                           u32 vmcr, int rt)
-{
-       __vgic_v3_read_apxrn(vcpu, rt, 1);
-}
-
-static void __hyp_text __vgic_v3_read_apxr2(struct kvm_vcpu *vcpu,
-                                           u32 vmcr, int rt)
-{
-       __vgic_v3_read_apxrn(vcpu, rt, 2);
-}
-
-static void __hyp_text __vgic_v3_read_apxr3(struct kvm_vcpu *vcpu,
-                                           u32 vmcr, int rt)
-{
-       __vgic_v3_read_apxrn(vcpu, rt, 3);
-}
-
-static void __hyp_text __vgic_v3_write_apxr0(struct kvm_vcpu *vcpu,
-                                            u32 vmcr, int rt)
-{
-       __vgic_v3_write_apxrn(vcpu, rt, 0);
-}
-
-static void __hyp_text __vgic_v3_write_apxr1(struct kvm_vcpu *vcpu,
-                                            u32 vmcr, int rt)
-{
-       __vgic_v3_write_apxrn(vcpu, rt, 1);
-}
-
-static void __hyp_text __vgic_v3_write_apxr2(struct kvm_vcpu *vcpu,
-                                            u32 vmcr, int rt)
-{
-       __vgic_v3_write_apxrn(vcpu, rt, 2);
-}
-
-static void __hyp_text __vgic_v3_write_apxr3(struct kvm_vcpu *vcpu,
-                                            u32 vmcr, int rt)
-{
-       __vgic_v3_write_apxrn(vcpu, rt, 3);
-}
-
-static void __hyp_text __vgic_v3_read_hppir(struct kvm_vcpu *vcpu,
-                                           u32 vmcr, int rt)
-{
-       u64 lr_val;
-       int lr, lr_grp, grp;
-
-       grp = __vgic_v3_get_group(vcpu);
-
-       lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val);
-       if (lr == -1)
-               goto spurious;
-
-       lr_grp = !!(lr_val & ICH_LR_GROUP);
-       if (lr_grp != grp)
-               lr_val = ICC_IAR1_EL1_SPURIOUS;
-
-spurious:
-       vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK);
-}
-
-static void __hyp_text __vgic_v3_read_pmr(struct kvm_vcpu *vcpu,
-                                         u32 vmcr, int rt)
-{
-       vmcr &= ICH_VMCR_PMR_MASK;
-       vmcr >>= ICH_VMCR_PMR_SHIFT;
-       vcpu_set_reg(vcpu, rt, vmcr);
-}
-
-static void __hyp_text __vgic_v3_write_pmr(struct kvm_vcpu *vcpu,
-                                          u32 vmcr, int rt)
-{
-       u32 val = vcpu_get_reg(vcpu, rt);
-
-       val <<= ICH_VMCR_PMR_SHIFT;
-       val &= ICH_VMCR_PMR_MASK;
-       vmcr &= ~ICH_VMCR_PMR_MASK;
-       vmcr |= val;
-
-       write_gicreg(vmcr, ICH_VMCR_EL2);
-}
-
-static void __hyp_text __vgic_v3_read_rpr(struct kvm_vcpu *vcpu,
-                                         u32 vmcr, int rt)
-{
-       u32 val = __vgic_v3_get_highest_active_priority();
-       vcpu_set_reg(vcpu, rt, val);
-}
-
-static void __hyp_text __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu,
-                                          u32 vmcr, int rt)
-{
-       u32 vtr, val;
-
-       vtr = read_gicreg(ICH_VTR_EL2);
-       /* PRIbits */
-       val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT;
-       /* IDbits */
-       val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT;
-       /* SEIS */
-       val |= ((vtr >> 22) & 1) << ICC_CTLR_EL1_SEIS_SHIFT;
-       /* A3V */
-       val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT;
-       /* EOImode */
-       val |= ((vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT) << ICC_CTLR_EL1_EOImode_SHIFT;
-       /* CBPR */
-       val |= (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT;
-
-       vcpu_set_reg(vcpu, rt, val);
-}
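
__vgic_v3_read_ctlr above builds the guest-visible ICC_CTLR_EL1 value by pulling individual fields out of ICH_VTR_EL2 and ICH_VMCR_EL2 and repacking them at the positions the guest expects. A minimal sketch of that extract-and-repack pattern, with invented destination shifts standing in for the real sysreg encodings:

#include <stdint.h>
#include <stdio.h>

/* isolate one field: shift it down, then mask off everything else */
static uint32_t get_field(uint32_t reg, int shift, uint32_t mask)
{
        return (reg >> shift) & mask;
}

static uint32_t synth_ctlr(uint32_t vtr, uint32_t vmcr)
{
        uint32_t val = 0;

        val |= get_field(vtr,  29, 0x7) << 8;    /* PRIbits (invented dest) */
        val |= get_field(vtr,  23, 0x7) << 11;   /* IDbits                  */
        val |= get_field(vtr,  22, 0x1) << 14;   /* SEIS                    */
        val |= get_field(vmcr,  9, 0x1) << 1;    /* EOImode                 */
        return val;
}

int main(void)
{
        printf("%#x\n", synth_ctlr(0xa0800000, 0x200));
        return 0;
}
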
-
-static void __hyp_text __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu,
-                                           u32 vmcr, int rt)
-{
-       u32 val = vcpu_get_reg(vcpu, rt);
-
-       if (val & ICC_CTLR_EL1_CBPR_MASK)
-               vmcr |= ICH_VMCR_CBPR_MASK;
-       else
-               vmcr &= ~ICH_VMCR_CBPR_MASK;
-
-       if (val & ICC_CTLR_EL1_EOImode_MASK)
-               vmcr |= ICH_VMCR_EOIM_MASK;
-       else
-               vmcr &= ~ICH_VMCR_EOIM_MASK;
-
-       write_gicreg(vmcr, ICH_VMCR_EL2);
-}
-
-int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
-{
-       int rt;
-       u32 esr;
-       u32 vmcr;
-       void (*fn)(struct kvm_vcpu *, u32, int);
-       bool is_read;
-       u32 sysreg;
-
-       esr = kvm_vcpu_get_hsr(vcpu);
-       if (vcpu_mode_is_32bit(vcpu)) {
-               if (!kvm_condition_valid(vcpu)) {
-                       __kvm_skip_instr(vcpu);
-                       return 1;
-               }
-
-               sysreg = esr_cp15_to_sysreg(esr);
-       } else {
-               sysreg = esr_sys64_to_sysreg(esr);
-       }
-
-       is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ;
-
-       switch (sysreg) {
-       case SYS_ICC_IAR0_EL1:
-       case SYS_ICC_IAR1_EL1:
-               if (unlikely(!is_read))
-                       return 0;
-               fn = __vgic_v3_read_iar;
-               break;
-       case SYS_ICC_EOIR0_EL1:
-       case SYS_ICC_EOIR1_EL1:
-               if (unlikely(is_read))
-                       return 0;
-               fn = __vgic_v3_write_eoir;
-               break;
-       case SYS_ICC_IGRPEN1_EL1:
-               if (is_read)
-                       fn = __vgic_v3_read_igrpen1;
-               else
-                       fn = __vgic_v3_write_igrpen1;
-               break;
-       case SYS_ICC_BPR1_EL1:
-               if (is_read)
-                       fn = __vgic_v3_read_bpr1;
-               else
-                       fn = __vgic_v3_write_bpr1;
-               break;
-       case SYS_ICC_AP0Rn_EL1(0):
-       case SYS_ICC_AP1Rn_EL1(0):
-               if (is_read)
-                       fn = __vgic_v3_read_apxr0;
-               else
-                       fn = __vgic_v3_write_apxr0;
-               break;
-       case SYS_ICC_AP0Rn_EL1(1):
-       case SYS_ICC_AP1Rn_EL1(1):
-               if (is_read)
-                       fn = __vgic_v3_read_apxr1;
-               else
-                       fn = __vgic_v3_write_apxr1;
-               break;
-       case SYS_ICC_AP0Rn_EL1(2):
-       case SYS_ICC_AP1Rn_EL1(2):
-               if (is_read)
-                       fn = __vgic_v3_read_apxr2;
-               else
-                       fn = __vgic_v3_write_apxr2;
-               break;
-       case SYS_ICC_AP0Rn_EL1(3):
-       case SYS_ICC_AP1Rn_EL1(3):
-               if (is_read)
-                       fn = __vgic_v3_read_apxr3;
-               else
-                       fn = __vgic_v3_write_apxr3;
-               break;
-       case SYS_ICC_HPPIR0_EL1:
-       case SYS_ICC_HPPIR1_EL1:
-               if (unlikely(!is_read))
-                       return 0;
-               fn = __vgic_v3_read_hppir;
-               break;
-       case SYS_ICC_IGRPEN0_EL1:
-               if (is_read)
-                       fn = __vgic_v3_read_igrpen0;
-               else
-                       fn = __vgic_v3_write_igrpen0;
-               break;
-       case SYS_ICC_BPR0_EL1:
-               if (is_read)
-                       fn = __vgic_v3_read_bpr0;
-               else
-                       fn = __vgic_v3_write_bpr0;
-               break;
-       case SYS_ICC_DIR_EL1:
-               if (unlikely(is_read))
-                       return 0;
-               fn = __vgic_v3_write_dir;
-               break;
-       case SYS_ICC_RPR_EL1:
-               if (unlikely(!is_read))
-                       return 0;
-               fn = __vgic_v3_read_rpr;
-               break;
-       case SYS_ICC_CTLR_EL1:
-               if (is_read)
-                       fn = __vgic_v3_read_ctlr;
-               else
-                       fn = __vgic_v3_write_ctlr;
-               break;
-       case SYS_ICC_PMR_EL1:
-               if (is_read)
-                       fn = __vgic_v3_read_pmr;
-               else
-                       fn = __vgic_v3_write_pmr;
-               break;
-       default:
-               return 0;
-       }
-
-       vmcr = __vgic_v3_read_vmcr();
-       rt = kvm_vcpu_sys_get_rt(vcpu);
-       fn(vcpu, vmcr, rt);
-
-       __kvm_skip_instr(vcpu);
-
-       return 1;
-}
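
__vgic_v3_perform_cpuif_access is the single entry point for every trapped GIC CPU interface access: it decodes the system register and the access direction from the syndrome, picks a handler (or bails out and lets the rest of KVM deal with it), and finally skips the trapped instruction. A stripped-down userspace model of that dispatch shape, with an invented context type and register encodings:

#include <stdio.h>

struct ctx { unsigned int reg[31]; };           /* stand-in for the vcpu */

enum { REG_IAR1 = 1, REG_EOIR1 = 2 };           /* invented encodings */

static void read_iar(struct ctx *c, int rt)   { c->reg[rt] = 1023; /* spurious */ }
static void write_eoir(struct ctx *c, int rt) { (void)c; (void)rt; }

/* returns 1 if handled here (instruction must be skipped), 0 otherwise */
static int handle_trap(struct ctx *c, int sysreg, int is_read, int rt)
{
        void (*fn)(struct ctx *, int);

        switch (sysreg) {
        case REG_IAR1:
                if (!is_read)
                        return 0;               /* wrong direction: punt */
                fn = read_iar;
                break;
        case REG_EOIR1:
                if (is_read)
                        return 0;
                fn = write_eoir;
                break;
        default:
                return 0;                       /* not ours */
        }

        fn(c, rt);
        return 1;
}

int main(void)
{
        struct ctx c = { { 0 } };

        printf("%d %d\n", handle_trap(&c, REG_IAR1, 1, 0),
                          handle_trap(&c, REG_EOIR1, 1, 0));
        return 0;
}
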
-
-#endif
diff --git a/virt/kvm/arm/hypercalls.c b/virt/kvm/arm/hypercalls.c
deleted file mode 100644 (file)
index 550dfa3..0000000
+++ /dev/null
@@ -1,71 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-// Copyright (C) 2019 Arm Ltd.
-
-#include <linux/arm-smccc.h>
-#include <linux/kvm_host.h>
-
-#include <asm/kvm_emulate.h>
-
-#include <kvm/arm_hypercalls.h>
-#include <kvm/arm_psci.h>
-
-int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
-{
-       u32 func_id = smccc_get_function(vcpu);
-       long val = SMCCC_RET_NOT_SUPPORTED;
-       u32 feature;
-       gpa_t gpa;
-
-       switch (func_id) {
-       case ARM_SMCCC_VERSION_FUNC_ID:
-               val = ARM_SMCCC_VERSION_1_1;
-               break;
-       case ARM_SMCCC_ARCH_FEATURES_FUNC_ID:
-               feature = smccc_get_arg1(vcpu);
-               switch (feature) {
-               case ARM_SMCCC_ARCH_WORKAROUND_1:
-                       switch (kvm_arm_harden_branch_predictor()) {
-                       case KVM_BP_HARDEN_UNKNOWN:
-                               break;
-                       case KVM_BP_HARDEN_WA_NEEDED:
-                               val = SMCCC_RET_SUCCESS;
-                               break;
-                       case KVM_BP_HARDEN_NOT_REQUIRED:
-                               val = SMCCC_RET_NOT_REQUIRED;
-                               break;
-                       }
-                       break;
-               case ARM_SMCCC_ARCH_WORKAROUND_2:
-                       switch (kvm_arm_have_ssbd()) {
-                       case KVM_SSBD_FORCE_DISABLE:
-                       case KVM_SSBD_UNKNOWN:
-                               break;
-                       case KVM_SSBD_KERNEL:
-                               val = SMCCC_RET_SUCCESS;
-                               break;
-                       case KVM_SSBD_FORCE_ENABLE:
-                       case KVM_SSBD_MITIGATED:
-                               val = SMCCC_RET_NOT_REQUIRED;
-                               break;
-                       }
-                       break;
-               case ARM_SMCCC_HV_PV_TIME_FEATURES:
-                       val = SMCCC_RET_SUCCESS;
-                       break;
-               }
-               break;
-       case ARM_SMCCC_HV_PV_TIME_FEATURES:
-               val = kvm_hypercall_pv_features(vcpu);
-               break;
-       case ARM_SMCCC_HV_PV_TIME_ST:
-               gpa = kvm_init_stolen_time(vcpu);
-               if (gpa != GPA_INVALID)
-                       val = gpa;
-               break;
-       default:
-               return kvm_psci_call(vcpu);
-       }
-
-       smccc_set_retval(vcpu, val, 0, 0, 0);
-       return 1;
-}
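
kvm_hvc_call_handler implements the SMCCC calling convention from the host side: the function ID arrives in the first argument register, anything unrecognised earns SMCCC_RET_NOT_SUPPORTED, and PSCI-shaped calls are forwarded to kvm_psci_call(). A tiny userspace model of that default-deny dispatch (the function IDs below are placeholders, not real SMCCC encodings):

#include <stdio.h>

#define RET_NOT_SUPPORTED  (-1L)        /* "we don't speak this" */
#define RET_SUCCESS          0L

enum { FN_VERSION = 0x8000, FN_FEATURES = 0x8001 };   /* placeholder IDs */

static long hvc_call(unsigned int func_id, unsigned int arg1)
{
        long val = RET_NOT_SUPPORTED;   /* default-deny */

        switch (func_id) {
        case FN_VERSION:
                val = 0x10001;          /* pretend we implement v1.1 */
                break;
        case FN_FEATURES:
                if (arg1 == FN_VERSION) /* only advertise what we handle */
                        val = RET_SUCCESS;
                break;
        }
        return val;
}

int main(void)
{
        printf("%ld %ld\n", hvc_call(FN_VERSION, 0), hvc_call(0xdead, 0));
        return 0;
}
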
diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c
deleted file mode 100644 (file)
index aedfcff..0000000
+++ /dev/null
@@ -1,200 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2012 - Virtual Open Systems and Columbia University
- * Author: Christoffer Dall <c.dall@virtualopensystems.com>
- */
-
-#include <linux/kvm_host.h>
-#include <asm/kvm_emulate.h>
-#include <trace/events/kvm.h>
-
-#include "trace.h"
-
-void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data)
-{
-       void *datap = NULL;
-       union {
-               u8      byte;
-               u16     hword;
-               u32     word;
-               u64     dword;
-       } tmp;
-
-       switch (len) {
-       case 1:
-               tmp.byte        = data;
-               datap           = &tmp.byte;
-               break;
-       case 2:
-               tmp.hword       = data;
-               datap           = &tmp.hword;
-               break;
-       case 4:
-               tmp.word        = data;
-               datap           = &tmp.word;
-               break;
-       case 8:
-               tmp.dword       = data;
-               datap           = &tmp.dword;
-               break;
-       }
-
-       memcpy(buf, datap, len);
-}
-
-unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len)
-{
-       unsigned long data = 0;
-       union {
-               u16     hword;
-               u32     word;
-               u64     dword;
-       } tmp;
-
-       switch (len) {
-       case 1:
-               data = *(u8 *)buf;
-               break;
-       case 2:
-               memcpy(&tmp.hword, buf, len);
-               data = tmp.hword;
-               break;
-       case 4:
-               memcpy(&tmp.word, buf, len);
-               data = tmp.word;
-               break;
-       case 8:
-               memcpy(&tmp.dword, buf, len);
-               data = tmp.dword;
-               break;
-       }
-
-       return data;
-}
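
kvm_mmio_write_buf and kvm_mmio_read_buf round-trip a register value through a byte buffer of exactly the access width, letting memcpy deal with the (possibly unaligned) run->mmio.data area. The simplified userspace analogue below assumes a little-endian host, which is precisely the assumption the kernel version avoids by going through a width-matched union:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* pack 'data' into the low 'len' bytes of buf (len = 1, 2, 4 or 8) */
static void write_buf(void *buf, unsigned int len, unsigned long data)
{
        uint64_t tmp = data;

        memcpy(buf, &tmp, len);         /* little-endian host assumed */
}

static unsigned long read_buf(const void *buf, unsigned int len)
{
        uint64_t tmp = 0;

        memcpy(&tmp, buf, len);
        return tmp;
}

int main(void)
{
        uint8_t buf[8];

        write_buf(buf, 2, 0xbeef);
        printf("%#lx\n", read_buf(buf, 2));     /* prints 0xbeef */
        return 0;
}
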
-
-/**
- * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation
- *                          or in-kernel IO emulation
- *
- * @vcpu: The VCPU pointer
- * @run:  The VCPU run struct containing the mmio data
- */
-int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
-{
-       unsigned long data;
-       unsigned int len;
-       int mask;
-
-       /* Detect an already handled MMIO return */
-       if (unlikely(!vcpu->mmio_needed))
-               return 0;
-
-       vcpu->mmio_needed = 0;
-
-       if (!kvm_vcpu_dabt_iswrite(vcpu)) {
-               len = kvm_vcpu_dabt_get_as(vcpu);
-               data = kvm_mmio_read_buf(run->mmio.data, len);
-
-               if (kvm_vcpu_dabt_issext(vcpu) &&
-                   len < sizeof(unsigned long)) {
-                       mask = 1U << ((len * 8) - 1);
-                       data = (data ^ mask) - mask;
-               }
-
-               if (!kvm_vcpu_dabt_issf(vcpu))
-                       data = data & 0xffffffff;
-
-               trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr,
-                              &data);
-               data = vcpu_data_host_to_guest(vcpu, data, len);
-               vcpu_set_reg(vcpu, kvm_vcpu_dabt_get_rd(vcpu), data);
-       }
-
-       /*
-        * The MMIO instruction is emulated and should not be re-executed
-        * in the guest.
-        */
-       kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
-
-       return 0;
-}
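
The read path above widens a narrow load with the classic (data ^ mask) - mask identity: mask holds the sign bit of the access width, the XOR flips it, and the subtraction then propagates the sign through all upper bits. A standalone demonstration:

#include <stdio.h>

/* sign-extend the low 'len' bytes of 'data' to a full long */
static long sign_extend(unsigned long data, unsigned int len)
{
        unsigned long mask = 1UL << (len * 8 - 1);   /* sign bit of the load */

        return (data ^ mask) - mask;
}

int main(void)
{
        printf("%ld\n", sign_extend(0xff, 1));       /* -1     */
        printf("%ld\n", sign_extend(0x7f, 1));       /*  127   */
        printf("%ld\n", sign_extend(0x8000, 2));     /* -32768 */
        return 0;
}
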
-
-int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
-                phys_addr_t fault_ipa)
-{
-       unsigned long data;
-       unsigned long rt;
-       int ret;
-       bool is_write;
-       int len;
-       u8 data_buf[8];
-
-       /*
-        * No valid syndrome? Ask userspace for help if it has
-        * volunteered to do so, and bail out otherwise.
-        */
-       if (!kvm_vcpu_dabt_isvalid(vcpu)) {
-               if (vcpu->kvm->arch.return_nisv_io_abort_to_user) {
-                       run->exit_reason = KVM_EXIT_ARM_NISV;
-                       run->arm_nisv.esr_iss = kvm_vcpu_dabt_iss_nisv_sanitized(vcpu);
-                       run->arm_nisv.fault_ipa = fault_ipa;
-                       return 0;
-               }
-
-               kvm_pr_unimpl("Data abort outside memslots with no valid syndrome info\n");
-               return -ENOSYS;
-       }
-
-       /* Page table accesses IO mem: tell guest to fix its TTBR */
-       if (kvm_vcpu_dabt_iss1tw(vcpu)) {
-               kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
-               return 1;
-       }
-
-       /*
-        * Prepare MMIO operation. First decode the syndrome data we get
-        * from the CPU. Then check whether some in-kernel emulation feels
-        * responsible; otherwise let user space do its magic.
-        */
-       is_write = kvm_vcpu_dabt_iswrite(vcpu);
-       len = kvm_vcpu_dabt_get_as(vcpu);
-       rt = kvm_vcpu_dabt_get_rd(vcpu);
-
-       if (is_write) {
-               data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt),
-                                              len);
-
-               trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, &data);
-               kvm_mmio_write_buf(data_buf, len, data);
-
-               ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len,
-                                      data_buf);
-       } else {
-               trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len,
-                              fault_ipa, NULL);
-
-               ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len,
-                                     data_buf);
-       }
-
-       /* Now prepare kvm_run for the potential return to userland. */
-       run->mmio.is_write      = is_write;
-       run->mmio.phys_addr     = fault_ipa;
-       run->mmio.len           = len;
-       vcpu->mmio_needed       = 1;
-
-       if (!ret) {
-               /* We handled the access successfully in the kernel. */
-               if (!is_write)
-                       memcpy(run->mmio.data, data_buf, len);
-               vcpu->stat.mmio_exit_kernel++;
-               kvm_handle_mmio_return(vcpu, run);
-               return 1;
-       }
-
-       if (is_write)
-               memcpy(run->mmio.data, data_buf, len);
-       vcpu->stat.mmio_exit_user++;
-       run->exit_reason        = KVM_EXIT_MMIO;
-       return 0;
-}
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
deleted file mode 100644 (file)
index e3b9ee2..0000000
+++ /dev/null
@@ -1,2447 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2012 - Virtual Open Systems and Columbia University
- * Author: Christoffer Dall <c.dall@virtualopensystems.com>
- */
-
-#include <linux/mman.h>
-#include <linux/kvm_host.h>
-#include <linux/io.h>
-#include <linux/hugetlb.h>
-#include <linux/sched/signal.h>
-#include <trace/events/kvm.h>
-#include <asm/pgalloc.h>
-#include <asm/cacheflush.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-#include <asm/kvm_ras.h>
-#include <asm/kvm_asm.h>
-#include <asm/kvm_emulate.h>
-#include <asm/virt.h>
-
-#include "trace.h"
-
-static pgd_t *boot_hyp_pgd;
-static pgd_t *hyp_pgd;
-static pgd_t *merged_hyp_pgd;
-static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
-
-static unsigned long hyp_idmap_start;
-static unsigned long hyp_idmap_end;
-static phys_addr_t hyp_idmap_vector;
-
-static unsigned long io_map_base;
-
-#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
-
-#define KVM_S2PTE_FLAG_IS_IOMAP                (1UL << 0)
-#define KVM_S2_FLAG_LOGGING_ACTIVE     (1UL << 1)
-
-static bool is_iomap(unsigned long flags)
-{
-       return flags & KVM_S2PTE_FLAG_IS_IOMAP;
-}
-
-static bool memslot_is_logging(struct kvm_memory_slot *memslot)
-{
-       return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
-}
-
-/**
- * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
- * @kvm:       pointer to kvm structure.
- *
- * Interface to HYP function to flush all VM TLB entries
- */
-void kvm_flush_remote_tlbs(struct kvm *kvm)
-{
-       kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
-}
-
-static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
-{
-       kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
-}
-
-/*
- * D-Cache management functions. They take the page table entries by
- * value, as they are flushing the cache using the kernel mapping (or
- * kmap on 32bit).
- */
-static void kvm_flush_dcache_pte(pte_t pte)
-{
-       __kvm_flush_dcache_pte(pte);
-}
-
-static void kvm_flush_dcache_pmd(pmd_t pmd)
-{
-       __kvm_flush_dcache_pmd(pmd);
-}
-
-static void kvm_flush_dcache_pud(pud_t pud)
-{
-       __kvm_flush_dcache_pud(pud);
-}
-
-static bool kvm_is_device_pfn(unsigned long pfn)
-{
-       return !pfn_valid(pfn);
-}
-
-/**
- * stage2_dissolve_pmd() - clear and flush huge PMD entry
- * @kvm:       pointer to kvm structure.
- * @addr:      IPA
- * @pmd:       pmd pointer for IPA
- *
- * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs.
- */
-static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
-{
-       if (!pmd_thp_or_huge(*pmd))
-               return;
-
-       pmd_clear(pmd);
-       kvm_tlb_flush_vmid_ipa(kvm, addr);
-       put_page(virt_to_page(pmd));
-}
-
-/**
- * stage2_dissolve_pud() - clear and flush huge PUD entry
- * @kvm:       pointer to kvm structure.
- * @addr:      IPA
- * @pud:       pud pointer for IPA
- *
- * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs.
- */
-static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp)
-{
-       if (!stage2_pud_huge(kvm, *pudp))
-               return;
-
-       stage2_pud_clear(kvm, pudp);
-       kvm_tlb_flush_vmid_ipa(kvm, addr);
-       put_page(virt_to_page(pudp));
-}
-
-static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
-                                 int min, int max)
-{
-       void *page;
-
-       BUG_ON(max > KVM_NR_MEM_OBJS);
-       if (cache->nobjs >= min)
-               return 0;
-       while (cache->nobjs < max) {
-               page = (void *)__get_free_page(GFP_PGTABLE_USER);
-               if (!page)
-                       return -ENOMEM;
-               cache->objects[cache->nobjs++] = page;
-       }
-       return 0;
-}
-
-static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
-{
-       while (mc->nobjs)
-               free_page((unsigned long)mc->objects[--mc->nobjs]);
-}
-
-static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
-{
-       void *p;
-
-       BUG_ON(!mc || !mc->nobjs);
-       p = mc->objects[--mc->nobjs];
-       return p;
-}
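
These three helpers implement the usual KVM pattern of topping up a small per-vcpu cache of pre-zeroed pages while sleeping is still allowed, so that the fault path can later take table pages without calling the page allocator under the mmu_lock. A minimal userspace model of that topup/alloc split (the capacity and names are invented):

#include <stdlib.h>
#include <stdio.h>

#define CACHE_MAX 8

struct obj_cache {
        int nobjs;
        void *objects[CACHE_MAX];
};

/* fill the cache up to 'max' entries; may allocate, so call it early */
static int cache_topup(struct obj_cache *c, int min, int max)
{
        if (c->nobjs >= min)
                return 0;
        while (c->nobjs < max) {
                void *page = calloc(1, 4096);

                if (!page)
                        return -1;
                c->objects[c->nobjs++] = page;
        }
        return 0;
}

/* hand out a pre-allocated object; never allocates, so safe under a lock */
static void *cache_alloc(struct obj_cache *c)
{
        return c->nobjs ? c->objects[--c->nobjs] : NULL;
}

int main(void)
{
        struct obj_cache c = { 0 };

        cache_topup(&c, 1, 4);
        printf("cached %d, got %p\n", c.nobjs, cache_alloc(&c));
        return 0;
}
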
-
-static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
-{
-       pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, pgd, 0UL);
-       stage2_pgd_clear(kvm, pgd);
-       kvm_tlb_flush_vmid_ipa(kvm, addr);
-       stage2_pud_free(kvm, pud_table);
-       put_page(virt_to_page(pgd));
-}
-
-static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
-{
-       pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
-       VM_BUG_ON(stage2_pud_huge(kvm, *pud));
-       stage2_pud_clear(kvm, pud);
-       kvm_tlb_flush_vmid_ipa(kvm, addr);
-       stage2_pmd_free(kvm, pmd_table);
-       put_page(virt_to_page(pud));
-}
-
-static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
-{
-       pte_t *pte_table = pte_offset_kernel(pmd, 0);
-       VM_BUG_ON(pmd_thp_or_huge(*pmd));
-       pmd_clear(pmd);
-       kvm_tlb_flush_vmid_ipa(kvm, addr);
-       free_page((unsigned long)pte_table);
-       put_page(virt_to_page(pmd));
-}
-
-static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
-{
-       WRITE_ONCE(*ptep, new_pte);
-       dsb(ishst);
-}
-
-static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
-{
-       WRITE_ONCE(*pmdp, new_pmd);
-       dsb(ishst);
-}
-
-static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
-{
-       kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
-}
-
-static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
-{
-       WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
-       dsb(ishst);
-}
-
-static inline void kvm_pgd_populate(pgd_t *pgdp, pud_t *pudp)
-{
-       WRITE_ONCE(*pgdp, kvm_mk_pgd(pudp));
-       dsb(ishst);
-}
-
-/*
- * Unmapping vs dcache management:
- *
- * If a guest maps certain memory pages as uncached, all writes will
- * bypass the data cache and go directly to RAM.  However, the CPUs
- * can still speculate reads (not writes) and fill cache lines with
- * data.
- *
- * Those cache lines will be *clean* cache lines though, so a
- * clean+invalidate operation is equivalent to an invalidate
- * operation, because no cache lines are marked dirty.
- *
- * Those clean cache lines could be filled prior to an uncached write
- * by the guest, and the cache coherent IO subsystem would therefore
- * end up writing old data to disk.
- *
- * This is why right after unmapping a page/section and invalidating
- * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
- * the IO subsystem will never hit in the cache.
- *
- * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
- * we then fully enforce cacheability of RAM, no matter what the guest
- * does.
- */
-static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
-                      phys_addr_t addr, phys_addr_t end)
-{
-       phys_addr_t start_addr = addr;
-       pte_t *pte, *start_pte;
-
-       start_pte = pte = pte_offset_kernel(pmd, addr);
-       do {
-               if (!pte_none(*pte)) {
-                       pte_t old_pte = *pte;
-
-                       kvm_set_pte(pte, __pte(0));
-                       kvm_tlb_flush_vmid_ipa(kvm, addr);
-
-                       /* No need to invalidate the cache for device mappings */
-                       if (!kvm_is_device_pfn(pte_pfn(old_pte)))
-                               kvm_flush_dcache_pte(old_pte);
-
-                       put_page(virt_to_page(pte));
-               }
-       } while (pte++, addr += PAGE_SIZE, addr != end);
-
-       if (stage2_pte_table_empty(kvm, start_pte))
-               clear_stage2_pmd_entry(kvm, pmd, start_addr);
-}
-
-static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud,
-                      phys_addr_t addr, phys_addr_t end)
-{
-       phys_addr_t next, start_addr = addr;
-       pmd_t *pmd, *start_pmd;
-
-       start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
-       do {
-               next = stage2_pmd_addr_end(kvm, addr, end);
-               if (!pmd_none(*pmd)) {
-                       if (pmd_thp_or_huge(*pmd)) {
-                               pmd_t old_pmd = *pmd;
-
-                               pmd_clear(pmd);
-                               kvm_tlb_flush_vmid_ipa(kvm, addr);
-
-                               kvm_flush_dcache_pmd(old_pmd);
-
-                               put_page(virt_to_page(pmd));
-                       } else {
-                               unmap_stage2_ptes(kvm, pmd, addr, next);
-                       }
-               }
-       } while (pmd++, addr = next, addr != end);
-
-       if (stage2_pmd_table_empty(kvm, start_pmd))
-               clear_stage2_pud_entry(kvm, pud, start_addr);
-}
-
-static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd,
-                      phys_addr_t addr, phys_addr_t end)
-{
-       phys_addr_t next, start_addr = addr;
-       pud_t *pud, *start_pud;
-
-       start_pud = pud = stage2_pud_offset(kvm, pgd, addr);
-       do {
-               next = stage2_pud_addr_end(kvm, addr, end);
-               if (!stage2_pud_none(kvm, *pud)) {
-                       if (stage2_pud_huge(kvm, *pud)) {
-                               pud_t old_pud = *pud;
-
-                               stage2_pud_clear(kvm, pud);
-                               kvm_tlb_flush_vmid_ipa(kvm, addr);
-                               kvm_flush_dcache_pud(old_pud);
-                               put_page(virt_to_page(pud));
-                       } else {
-                               unmap_stage2_pmds(kvm, pud, addr, next);
-                       }
-               }
-       } while (pud++, addr = next, addr != end);
-
-       if (stage2_pud_table_empty(kvm, start_pud))
-               clear_stage2_pgd_entry(kvm, pgd, start_addr);
-}
-
-/**
- * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
- * @kvm:   The VM pointer
- * @start: The intermediate physical base address of the range to unmap
- * @size:  The size of the area to unmap
- *
- * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
- * be called while holding mmu_lock (unless for freeing the stage2 pgd before
- * destroying the VM), otherwise another faulting VCPU may come in and mess
- * with things behind our backs.
- */
-static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
-{
-       pgd_t *pgd;
-       phys_addr_t addr = start, end = start + size;
-       phys_addr_t next;
-
-       assert_spin_locked(&kvm->mmu_lock);
-       WARN_ON(size & ~PAGE_MASK);
-
-       pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
-       do {
-               /*
-                * Make sure the page table is still active, as another thread
-                * could have possibly freed the page table, while we released
-                * the lock.
-                */
-               if (!READ_ONCE(kvm->arch.pgd))
-                       break;
-               next = stage2_pgd_addr_end(kvm, addr, end);
-               if (!stage2_pgd_none(kvm, *pgd))
-                       unmap_stage2_puds(kvm, pgd, addr, next);
-               /*
-                * If the range is too large, release the kvm->mmu_lock
-                * to prevent starvation and lockup detector warnings.
-                */
-               if (next != end)
-                       cond_resched_lock(&kvm->mmu_lock);
-       } while (pgd++, addr = next, addr != end);
-}
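
unmap_stage2_range and the per-level walkers feeding it all share one iteration shape: compute the end of the current block with a *_addr_end() helper (clamped to the overall end), descend if the entry is present, then advance to the next block. A generic userspace rendition of that pattern over fixed-size blocks (the 2MiB block size is just an example):

#include <stdio.h>
#include <stdint.h>

#define BLOCK_SIZE (2UL << 20)          /* example block size: 2MiB */

/* end of the block containing addr, clamped to the overall end */
static uint64_t block_addr_end(uint64_t addr, uint64_t end)
{
        uint64_t boundary = (addr + BLOCK_SIZE) & ~(BLOCK_SIZE - 1);

        return boundary < end ? boundary : end;
}

static void walk_range(uint64_t addr, uint64_t end)
{
        uint64_t next;

        do {
                next = block_addr_end(addr, end);
                printf("visit [%#lx, %#lx)\n", (unsigned long)addr,
                       (unsigned long)next);
        } while (addr = next, addr != end);
}

int main(void)
{
        walk_range(0x1ff000, 0x601000); /* unaligned range, chunked per block */
        return 0;
}
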
-
-static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
-                             phys_addr_t addr, phys_addr_t end)
-{
-       pte_t *pte;
-
-       pte = pte_offset_kernel(pmd, addr);
-       do {
-               if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
-                       kvm_flush_dcache_pte(*pte);
-       } while (pte++, addr += PAGE_SIZE, addr != end);
-}
-
-static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
-                             phys_addr_t addr, phys_addr_t end)
-{
-       pmd_t *pmd;
-       phys_addr_t next;
-
-       pmd = stage2_pmd_offset(kvm, pud, addr);
-       do {
-               next = stage2_pmd_addr_end(kvm, addr, end);
-               if (!pmd_none(*pmd)) {
-                       if (pmd_thp_or_huge(*pmd))
-                               kvm_flush_dcache_pmd(*pmd);
-                       else
-                               stage2_flush_ptes(kvm, pmd, addr, next);
-               }
-       } while (pmd++, addr = next, addr != end);
-}
-
-static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
-                             phys_addr_t addr, phys_addr_t end)
-{
-       pud_t *pud;
-       phys_addr_t next;
-
-       pud = stage2_pud_offset(kvm, pgd, addr);
-       do {
-               next = stage2_pud_addr_end(kvm, addr, end);
-               if (!stage2_pud_none(kvm, *pud)) {
-                       if (stage2_pud_huge(kvm, *pud))
-                               kvm_flush_dcache_pud(*pud);
-                       else
-                               stage2_flush_pmds(kvm, pud, addr, next);
-               }
-       } while (pud++, addr = next, addr != end);
-}
-
-static void stage2_flush_memslot(struct kvm *kvm,
-                                struct kvm_memory_slot *memslot)
-{
-       phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
-       phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
-       phys_addr_t next;
-       pgd_t *pgd;
-
-       pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
-       do {
-               next = stage2_pgd_addr_end(kvm, addr, end);
-               if (!stage2_pgd_none(kvm, *pgd))
-                       stage2_flush_puds(kvm, pgd, addr, next);
-       } while (pgd++, addr = next, addr != end);
-}
-
-/**
- * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
- * @kvm: The struct kvm pointer
- *
- * Go through the stage 2 page tables and invalidate any cache lines
- * backing memory already mapped to the VM.
- */
-static void stage2_flush_vm(struct kvm *kvm)
-{
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
-       int idx;
-
-       idx = srcu_read_lock(&kvm->srcu);
-       spin_lock(&kvm->mmu_lock);
-
-       slots = kvm_memslots(kvm);
-       kvm_for_each_memslot(memslot, slots)
-               stage2_flush_memslot(kvm, memslot);
-
-       spin_unlock(&kvm->mmu_lock);
-       srcu_read_unlock(&kvm->srcu, idx);
-}
-
-static void clear_hyp_pgd_entry(pgd_t *pgd)
-{
-       pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL);
-       pgd_clear(pgd);
-       pud_free(NULL, pud_table);
-       put_page(virt_to_page(pgd));
-}
-
-static void clear_hyp_pud_entry(pud_t *pud)
-{
-       pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
-       VM_BUG_ON(pud_huge(*pud));
-       pud_clear(pud);
-       pmd_free(NULL, pmd_table);
-       put_page(virt_to_page(pud));
-}
-
-static void clear_hyp_pmd_entry(pmd_t *pmd)
-{
-       pte_t *pte_table = pte_offset_kernel(pmd, 0);
-       VM_BUG_ON(pmd_thp_or_huge(*pmd));
-       pmd_clear(pmd);
-       pte_free_kernel(NULL, pte_table);
-       put_page(virt_to_page(pmd));
-}
-
-static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
-{
-       pte_t *pte, *start_pte;
-
-       start_pte = pte = pte_offset_kernel(pmd, addr);
-       do {
-               if (!pte_none(*pte)) {
-                       kvm_set_pte(pte, __pte(0));
-                       put_page(virt_to_page(pte));
-               }
-       } while (pte++, addr += PAGE_SIZE, addr != end);
-
-       if (hyp_pte_table_empty(start_pte))
-               clear_hyp_pmd_entry(pmd);
-}
-
-static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
-{
-       phys_addr_t next;
-       pmd_t *pmd, *start_pmd;
-
-       start_pmd = pmd = pmd_offset(pud, addr);
-       do {
-               next = pmd_addr_end(addr, end);
-               /* Hyp doesn't use huge pmds */
-               if (!pmd_none(*pmd))
-                       unmap_hyp_ptes(pmd, addr, next);
-       } while (pmd++, addr = next, addr != end);
-
-       if (hyp_pmd_table_empty(start_pmd))
-               clear_hyp_pud_entry(pud);
-}
-
-static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
-{
-       phys_addr_t next;
-       pud_t *pud, *start_pud;
-
-       start_pud = pud = pud_offset(pgd, addr);
-       do {
-               next = pud_addr_end(addr, end);
-               /* Hyp doesn't use huge puds */
-               if (!pud_none(*pud))
-                       unmap_hyp_pmds(pud, addr, next);
-       } while (pud++, addr = next, addr != end);
-
-       if (hyp_pud_table_empty(start_pud))
-               clear_hyp_pgd_entry(pgd);
-}
-
-static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd)
-{
-       return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1);
-}
-
-static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd,
-                             phys_addr_t start, u64 size)
-{
-       pgd_t *pgd;
-       phys_addr_t addr = start, end = start + size;
-       phys_addr_t next;
-
-       /*
-        * We don't unmap anything from HYP, except at the hyp tear down.
-        * Hence, we don't have to invalidate the TLBs here.
-        */
-       pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
-       do {
-               next = pgd_addr_end(addr, end);
-               if (!pgd_none(*pgd))
-                       unmap_hyp_puds(pgd, addr, next);
-       } while (pgd++, addr = next, addr != end);
-}
-
-static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
-{
-       __unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size);
-}
-
-static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size)
-{
-       __unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size);
-}
-
-/**
- * free_hyp_pgds - free Hyp-mode page tables
- *
- * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
- * therefore contains either mappings in the kernel memory area (above
- * PAGE_OFFSET), or device mappings in the idmap range.
- *
- * boot_hyp_pgd should only map the idmap range, and is only used in
- * the extended idmap case.
- */
-void free_hyp_pgds(void)
-{
-       pgd_t *id_pgd;
-
-       mutex_lock(&kvm_hyp_pgd_mutex);
-
-       id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd;
-
-       if (id_pgd) {
-               /* In case we never called hyp_mmu_init() */
-               if (!io_map_base)
-                       io_map_base = hyp_idmap_start;
-               unmap_hyp_idmap_range(id_pgd, io_map_base,
-                                     hyp_idmap_start + PAGE_SIZE - io_map_base);
-       }
-
-       if (boot_hyp_pgd) {
-               free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
-               boot_hyp_pgd = NULL;
-       }
-
-       if (hyp_pgd) {
-               unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET),
-                               (uintptr_t)high_memory - PAGE_OFFSET);
-
-               free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
-               hyp_pgd = NULL;
-       }
-       if (merged_hyp_pgd) {
-               clear_page(merged_hyp_pgd);
-               free_page((unsigned long)merged_hyp_pgd);
-               merged_hyp_pgd = NULL;
-       }
-
-       mutex_unlock(&kvm_hyp_pgd_mutex);
-}
-
-static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
-                                   unsigned long end, unsigned long pfn,
-                                   pgprot_t prot)
-{
-       pte_t *pte;
-       unsigned long addr;
-
-       addr = start;
-       do {
-               pte = pte_offset_kernel(pmd, addr);
-               kvm_set_pte(pte, kvm_pfn_pte(pfn, prot));
-               get_page(virt_to_page(pte));
-               pfn++;
-       } while (addr += PAGE_SIZE, addr != end);
-}
-
-static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
-                                  unsigned long end, unsigned long pfn,
-                                  pgprot_t prot)
-{
-       pmd_t *pmd;
-       pte_t *pte;
-       unsigned long addr, next;
-
-       addr = start;
-       do {
-               pmd = pmd_offset(pud, addr);
-
-               BUG_ON(pmd_sect(*pmd));
-
-               if (pmd_none(*pmd)) {
-                       pte = pte_alloc_one_kernel(NULL);
-                       if (!pte) {
-                               kvm_err("Cannot allocate Hyp pte\n");
-                               return -ENOMEM;
-                       }
-                       kvm_pmd_populate(pmd, pte);
-                       get_page(virt_to_page(pmd));
-               }
-
-               next = pmd_addr_end(addr, end);
-
-               create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
-               pfn += (next - addr) >> PAGE_SHIFT;
-       } while (addr = next, addr != end);
-
-       return 0;
-}
-
-static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start,
-                                  unsigned long end, unsigned long pfn,
-                                  pgprot_t prot)
-{
-       pud_t *pud;
-       pmd_t *pmd;
-       unsigned long addr, next;
-       int ret;
-
-       addr = start;
-       do {
-               pud = pud_offset(pgd, addr);
-
-               if (pud_none_or_clear_bad(pud)) {
-                       pmd = pmd_alloc_one(NULL, addr);
-                       if (!pmd) {
-                               kvm_err("Cannot allocate Hyp pmd\n");
-                               return -ENOMEM;
-                       }
-                       kvm_pud_populate(pud, pmd);
-                       get_page(virt_to_page(pud));
-               }
-
-               next = pud_addr_end(addr, end);
-               ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
-               if (ret)
-                       return ret;
-               pfn += (next - addr) >> PAGE_SHIFT;
-       } while (addr = next, addr != end);
-
-       return 0;
-}
-
-static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd,
-                                unsigned long start, unsigned long end,
-                                unsigned long pfn, pgprot_t prot)
-{
-       pgd_t *pgd;
-       pud_t *pud;
-       unsigned long addr, next;
-       int err = 0;
-
-       mutex_lock(&kvm_hyp_pgd_mutex);
-       addr = start & PAGE_MASK;
-       end = PAGE_ALIGN(end);
-       do {
-               pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
-
-               if (pgd_none(*pgd)) {
-                       pud = pud_alloc_one(NULL, addr);
-                       if (!pud) {
-                               kvm_err("Cannot allocate Hyp pud\n");
-                               err = -ENOMEM;
-                               goto out;
-                       }
-                       kvm_pgd_populate(pgd, pud);
-                       get_page(virt_to_page(pgd));
-               }
-
-               next = pgd_addr_end(addr, end);
-               err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot);
-               if (err)
-                       goto out;
-               pfn += (next - addr) >> PAGE_SHIFT;
-       } while (addr = next, addr != end);
-out:
-       mutex_unlock(&kvm_hyp_pgd_mutex);
-       return err;
-}
-
-static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
-{
-       if (!is_vmalloc_addr(kaddr)) {
-               BUG_ON(!virt_addr_valid(kaddr));
-               return __pa(kaddr);
-       } else {
-               return page_to_phys(vmalloc_to_page(kaddr)) +
-                      offset_in_page(kaddr);
-       }
-}
-
-/**
- * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
- * @from:      The virtual kernel start address of the range
- * @to:                The virtual kernel end address of the range (exclusive)
- * @prot:      The protection to be applied to this range
- *
- * The same virtual address as the kernel virtual address is also used
- * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
- * physical pages.
- */
-int create_hyp_mappings(void *from, void *to, pgprot_t prot)
-{
-       phys_addr_t phys_addr;
-       unsigned long virt_addr;
-       unsigned long start = kern_hyp_va((unsigned long)from);
-       unsigned long end = kern_hyp_va((unsigned long)to);
-
-       if (is_kernel_in_hyp_mode())
-               return 0;
-
-       start = start & PAGE_MASK;
-       end = PAGE_ALIGN(end);
-
-       for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
-               int err;
-
-               phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
-               err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD,
-                                           virt_addr, virt_addr + PAGE_SIZE,
-                                           __phys_to_pfn(phys_addr),
-                                           prot);
-               if (err)
-                       return err;
-       }
-
-       return 0;
-}
-
-static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
-                                       unsigned long *haddr, pgprot_t prot)
-{
-       pgd_t *pgd = hyp_pgd;
-       unsigned long base;
-       int ret = 0;
-
-       mutex_lock(&kvm_hyp_pgd_mutex);
-
-       /*
-        * This assumes that we have enough space below the idmap
-        * page to allocate our VAs. If not, the check below will
-        * kick in. A potential alternative would be to detect that
-        * overflow and switch to an allocation above the idmap.
-        *
-        * The allocated size is always a multiple of PAGE_SIZE.
-        */
-       size = PAGE_ALIGN(size + offset_in_page(phys_addr));
-       base = io_map_base - size;
-
-       /*
-        * Verify that BIT(VA_BITS - 1) hasn't been flipped by
-        * allocating the new area, as it would indicate we've
-        * overflowed the idmap/IO address range.
-        */
-       if ((base ^ io_map_base) & BIT(VA_BITS - 1))
-               ret = -ENOMEM;
-       else
-               io_map_base = base;
-
-       mutex_unlock(&kvm_hyp_pgd_mutex);
-
-       if (ret)
-               goto out;
-
-       if (__kvm_cpu_uses_extended_idmap())
-               pgd = boot_hyp_pgd;
-
-       ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
-                                   base, base + size,
-                                   __phys_to_pfn(phys_addr), prot);
-       if (ret)
-               goto out;
-
-       *haddr = base + offset_in_page(phys_addr);
-
-out:
-       return ret;
-}
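
__create_hyp_private_mapping hands out private HYP VAs by growing io_map_base downwards and, rather than tracking an explicit floor, detects exhaustion by checking whether bit (VA_BITS - 1) flipped between the old and new base. A small userspace model of that bump-down allocator (the VA_BITS value and starting base are arbitrary here):

#include <stdint.h>
#include <stdio.h>

#define VA_BITS 12                      /* tiny window, for demonstration */

static uint64_t io_map_base = 0xe00;    /* arbitrary start, top bit set */

/* allocate 'size' bytes of VA just below the current base, or fail */
static int alloc_down(uint64_t size, uint64_t *out)
{
        uint64_t base = io_map_base - size;

        /* if bit VA_BITS-1 flipped, we fell out of the upper half */
        if ((base ^ io_map_base) & (1UL << (VA_BITS - 1)))
                return -1;

        io_map_base = base;
        *out = base;
        return 0;
}

int main(void)
{
        uint64_t va;

        while (!alloc_down(0x200, &va))
                printf("got %#lx\n", (unsigned long)va);
        printf("window exhausted\n");
        return 0;
}
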
-
-/**
- * create_hyp_io_mappings - Map IO into both kernel and HYP
- * @phys_addr: The physical start address which gets mapped
- * @size:      Size of the region being mapped
- * @kaddr:     Kernel VA for this mapping
- * @haddr:     HYP VA for this mapping
- */
-int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
-                          void __iomem **kaddr,
-                          void __iomem **haddr)
-{
-       unsigned long addr;
-       int ret;
-
-       *kaddr = ioremap(phys_addr, size);
-       if (!*kaddr)
-               return -ENOMEM;
-
-       if (is_kernel_in_hyp_mode()) {
-               *haddr = *kaddr;
-               return 0;
-       }
-
-       ret = __create_hyp_private_mapping(phys_addr, size,
-                                          &addr, PAGE_HYP_DEVICE);
-       if (ret) {
-               iounmap(*kaddr);
-               *kaddr = NULL;
-               *haddr = NULL;
-               return ret;
-       }
-
-       *haddr = (void __iomem *)addr;
-       return 0;
-}
-
-/**
- * create_hyp_exec_mappings - Map an executable range into HYP
- * @phys_addr: The physical start address which gets mapped
- * @size:      Size of the region being mapped
- * @haddr:     HYP VA for this mapping
- */
-int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
-                            void **haddr)
-{
-       unsigned long addr;
-       int ret;
-
-       BUG_ON(is_kernel_in_hyp_mode());
-
-       ret = __create_hyp_private_mapping(phys_addr, size,
-                                          &addr, PAGE_HYP_EXEC);
-       if (ret) {
-               *haddr = NULL;
-               return ret;
-       }
-
-       *haddr = (void *)addr;
-       return 0;
-}
-
-/**
- * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
- * @kvm:       The KVM struct pointer for the VM.
- *
- * Allocates only the stage-2 HW PGD level table(s) of size defined by
- * stage2_pgd_size(kvm).
- *
- * Note we don't need locking here as this is only called when the VM is
- * created, which can only be done once.
- */
-int kvm_alloc_stage2_pgd(struct kvm *kvm)
-{
-       phys_addr_t pgd_phys;
-       pgd_t *pgd;
-
-       if (kvm->arch.pgd != NULL) {
-               kvm_err("kvm_arch already initialized?\n");
-               return -EINVAL;
-       }
-
-       /* Allocate the HW PGD, making sure that each page gets its own refcount */
-       pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO);
-       if (!pgd)
-               return -ENOMEM;
-
-       pgd_phys = virt_to_phys(pgd);
-       if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)))
-               return -EINVAL;
-
-       kvm->arch.pgd = pgd;
-       kvm->arch.pgd_phys = pgd_phys;
-       return 0;
-}
-
-static void stage2_unmap_memslot(struct kvm *kvm,
-                                struct kvm_memory_slot *memslot)
-{
-       hva_t hva = memslot->userspace_addr;
-       phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
-       phys_addr_t size = PAGE_SIZE * memslot->npages;
-       hva_t reg_end = hva + size;
-
-       /*
-        * A memory region could potentially cover multiple VMAs, and any holes
-        * between them, so iterate over all of them to find out if we should
-        * unmap any of them.
-        *
-        *     +--------------------------------------------+
-        * +---------------+----------------+   +----------------+
-        * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
-        * +---------------+----------------+   +----------------+
-        *     |               memory region                |
-        *     +--------------------------------------------+
-        */
-       do {
-               struct vm_area_struct *vma = find_vma(current->mm, hva);
-               hva_t vm_start, vm_end;
-
-               if (!vma || vma->vm_start >= reg_end)
-                       break;
-
-               /*
-                * Take the intersection of this VMA with the memory region
-                */
-               vm_start = max(hva, vma->vm_start);
-               vm_end = min(reg_end, vma->vm_end);
-
-               if (!(vma->vm_flags & VM_PFNMAP)) {
-                       gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
-                       unmap_stage2_range(kvm, gpa, vm_end - vm_start);
-               }
-               hva = vm_end;
-       } while (hva < reg_end);
-}
-
-/**
- * stage2_unmap_vm - Unmap Stage-2 RAM mappings
- * @kvm: The struct kvm pointer
- *
- * Go through the memregions and unmap any regular RAM
- * backing memory already mapped to the VM.
- */
-void stage2_unmap_vm(struct kvm *kvm)
-{
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
-       int idx;
-
-       idx = srcu_read_lock(&kvm->srcu);
-       down_read(&current->mm->mmap_sem);
-       spin_lock(&kvm->mmu_lock);
-
-       slots = kvm_memslots(kvm);
-       kvm_for_each_memslot(memslot, slots)
-               stage2_unmap_memslot(kvm, memslot);
-
-       spin_unlock(&kvm->mmu_lock);
-       up_read(&current->mm->mmap_sem);
-       srcu_read_unlock(&kvm->srcu, idx);
-}
-
-/**
- * kvm_free_stage2_pgd - free all stage-2 tables
- * @kvm:       The KVM struct pointer for the VM.
- *
- * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
- * underlying level-2 and level-3 tables before freeing the actual level-1 table
- * and setting the struct pointer to NULL.
- */
-void kvm_free_stage2_pgd(struct kvm *kvm)
-{
-       void *pgd = NULL;
-
-       spin_lock(&kvm->mmu_lock);
-       if (kvm->arch.pgd) {
-               unmap_stage2_range(kvm, 0, kvm_phys_size(kvm));
-               pgd = READ_ONCE(kvm->arch.pgd);
-               kvm->arch.pgd = NULL;
-               kvm->arch.pgd_phys = 0;
-       }
-       spin_unlock(&kvm->mmu_lock);
-
-       /* Free the HW pgd, one page at a time */
-       if (pgd)
-               free_pages_exact(pgd, stage2_pgd_size(kvm));
-}
-
-static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-                            phys_addr_t addr)
-{
-       pgd_t *pgd;
-       pud_t *pud;
-
-       pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
-       if (stage2_pgd_none(kvm, *pgd)) {
-               if (!cache)
-                       return NULL;
-               pud = mmu_memory_cache_alloc(cache);
-               stage2_pgd_populate(kvm, pgd, pud);
-               get_page(virt_to_page(pgd));
-       }
-
-       return stage2_pud_offset(kvm, pgd, addr);
-}
-
-static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-                            phys_addr_t addr)
-{
-       pud_t *pud;
-       pmd_t *pmd;
-
-       pud = stage2_get_pud(kvm, cache, addr);
-       if (!pud || stage2_pud_huge(kvm, *pud))
-               return NULL;
-
-       if (stage2_pud_none(kvm, *pud)) {
-               if (!cache)
-                       return NULL;
-               pmd = mmu_memory_cache_alloc(cache);
-               stage2_pud_populate(kvm, pud, pmd);
-               get_page(virt_to_page(pud));
-       }
-
-       return stage2_pmd_offset(kvm, pud, addr);
-}
-
-static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
-                              *cache, phys_addr_t addr, const pmd_t *new_pmd)
-{
-       pmd_t *pmd, old_pmd;
-
-retry:
-       pmd = stage2_get_pmd(kvm, cache, addr);
-       VM_BUG_ON(!pmd);
-
-       old_pmd = *pmd;
-       /*
-        * Multiple vcpus faulting on the same PMD entry can
-        * lead to them sequentially updating the PMD with the
-        * same value. Following the break-before-make
-        * (pmd_clear() followed by tlb_flush()) process can
-        * hinder forward progress due to refaults generated
-        * on missing translations.
-        *
-        * Skip updating the page table if the entry is
-        * unchanged.
-        */
-       if (pmd_val(old_pmd) == pmd_val(*new_pmd))
-               return 0;
-
-       if (pmd_present(old_pmd)) {
-               /*
-                * If we already have a PTE-level mapping for this block,
-                * we must unmap it to avoid inconsistent TLB state and
-                * leaking the table page. We could end up in this situation
-                * if the memory slot was marked for dirty logging and was
-                * reverted, leaving PTE level mappings for the pages accessed
-                * during the period. So, unmap the PTE level mapping for this
-                * block and retry, as we could have released the upper level
-                * table in the process.
-                *
-                * Normal THP split/merge follows mmu_notifier callbacks and
-                * gets handled accordingly.
-                */
-               if (!pmd_thp_or_huge(old_pmd)) {
-                       unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
-                       goto retry;
-               }
-               /*
-                * Mapping in huge pages should only happen through a
-                * fault.  If a page is merged into a transparent huge
-                * page, the individual subpages of that huge page
-                * should be unmapped through MMU notifiers before we
-                * get here.
-                *
-                * Merging of CompoundPages is not supported; they
-                * should be split first, unmapped, merged,
-                * and mapped back in on-demand.
-                */
-               WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
-               pmd_clear(pmd);
-               kvm_tlb_flush_vmid_ipa(kvm, addr);
-       } else {
-               get_page(virt_to_page(pmd));
-       }
-
-       kvm_set_pmd(pmd, *new_pmd);
-       return 0;
-}
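
stage2_set_pmd_huge captures two rules that recur all through this file: leave the entry alone if the new value equals the old one (avoiding pointless break-before-make refaults), and otherwise clear the entry and invalidate the TLB before installing the replacement. A userspace model with a stubbed-out TLB flush (the entry type and helpers are stand-ins):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t entry_t;               /* stand-in for a block mapping */

#define ENTRY_VALID (1UL << 0)

static void tlb_flush(void) { puts("tlb flush"); }   /* stub */

/* install 'newval' at *entry following break-before-make */
static void set_entry(entry_t *entry, entry_t newval)
{
        if (*entry == newval)           /* unchanged: skip the update */
                return;

        if (*entry & ENTRY_VALID) {     /* break ... */
                *entry = 0;
                tlb_flush();
        }
        *entry = newval;                /* ... then make */
}

int main(void)
{
        entry_t e = 0;

        set_entry(&e, 0x1000 | ENTRY_VALID);    /* first map: no flush */
        set_entry(&e, 0x1000 | ENTRY_VALID);    /* same value: skipped */
        set_entry(&e, 0x2000 | ENTRY_VALID);    /* remap: flush first  */
        return 0;
}
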
-
-static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-                              phys_addr_t addr, const pud_t *new_pudp)
-{
-       pud_t *pudp, old_pud;
-
-retry:
-       pudp = stage2_get_pud(kvm, cache, addr);
-       VM_BUG_ON(!pudp);
-
-       old_pud = *pudp;
-
-       /*
-        * A large number of vcpus faulting on the same stage 2 entry
-        * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
-        * Skip updating the page tables if there is no change.
-        */
-       if (pud_val(old_pud) == pud_val(*new_pudp))
-               return 0;
-
-       if (stage2_pud_present(kvm, old_pud)) {
-               /*
-                * If we already have table level mapping for this block, unmap
-                * the range for this block and retry.
-                */
-               if (!stage2_pud_huge(kvm, old_pud)) {
-                       unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
-                       goto retry;
-               }
-
-               WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
-               stage2_pud_clear(kvm, pudp);
-               kvm_tlb_flush_vmid_ipa(kvm, addr);
-       } else {
-               get_page(virt_to_page(pudp));
-       }
-
-       kvm_set_pud(pudp, *new_pudp);
-       return 0;
-}
-
-/*
- * stage2_get_leaf_entry - walk the stage2 VM page tables and return
- * true if a valid and present leaf-entry is found. A pointer to the
- * leaf-entry is returned in the appropriate level variable - pudpp,
- * pmdpp, ptepp.
- */
-static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr,
-                                 pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
-{
-       pud_t *pudp;
-       pmd_t *pmdp;
-       pte_t *ptep;
-
-       *pudpp = NULL;
-       *pmdpp = NULL;
-       *ptepp = NULL;
-
-       pudp = stage2_get_pud(kvm, NULL, addr);
-       if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
-               return false;
-
-       if (stage2_pud_huge(kvm, *pudp)) {
-               *pudpp = pudp;
-               return true;
-       }
-
-       pmdp = stage2_pmd_offset(kvm, pudp, addr);
-       if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
-               return false;
-
-       if (pmd_thp_or_huge(*pmdp)) {
-               *pmdpp = pmdp;
-               return true;
-       }
-
-       ptep = pte_offset_kernel(pmdp, addr);
-       if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
-               return false;
-
-       *ptepp = ptep;
-       return true;
-}
-
-static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr)
-{
-       pud_t *pudp;
-       pmd_t *pmdp;
-       pte_t *ptep;
-       bool found;
-
-       found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep);
-       if (!found)
-               return false;
-
-       if (pudp)
-               return kvm_s2pud_exec(pudp);
-       else if (pmdp)
-               return kvm_s2pmd_exec(pmdp);
-       else
-               return kvm_s2pte_exec(ptep);
-}
-
-static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-                         phys_addr_t addr, const pte_t *new_pte,
-                         unsigned long flags)
-{
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte, old_pte;
-       bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
-       bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
-
-       VM_BUG_ON(logging_active && !cache);
-
-       /* Create stage-2 page table mapping - Levels 0 and 1 */
-       pud = stage2_get_pud(kvm, cache, addr);
-       if (!pud) {
-               /*
-                * Ignore calls from kvm_set_spte_hva for unallocated
-                * address ranges.
-                */
-               return 0;
-       }
-
-       /*
-        * While dirty page logging is active, dissolve the huge PUD and
-        * then continue on to allocate a page.
-        */
-       if (logging_active)
-               stage2_dissolve_pud(kvm, addr, pud);
-
-       if (stage2_pud_none(kvm, *pud)) {
-               if (!cache)
-                       return 0; /* ignore calls from kvm_set_spte_hva */
-               pmd = mmu_memory_cache_alloc(cache);
-               stage2_pud_populate(kvm, pud, pmd);
-               get_page(virt_to_page(pud));
-       }
-
-       pmd = stage2_pmd_offset(kvm, pud, addr);
-       if (!pmd) {
-               /*
-                * Ignore calls from kvm_set_spte_hva for unallocated
-                * address ranges.
-                */
-               return 0;
-       }
-
-       /*
-        * While dirty page logging is active, dissolve the huge PMD and then
-        * continue on to allocate a page.
-        */
-       if (logging_active)
-               stage2_dissolve_pmd(kvm, addr, pmd);
-
-       /* Create stage-2 page mappings - Level 2 */
-       if (pmd_none(*pmd)) {
-               if (!cache)
-                       return 0; /* ignore calls from kvm_set_spte_hva */
-               pte = mmu_memory_cache_alloc(cache);
-               kvm_pmd_populate(pmd, pte);
-               get_page(virt_to_page(pmd));
-       }
-
-       pte = pte_offset_kernel(pmd, addr);
-
-       if (iomap && pte_present(*pte))
-               return -EFAULT;
-
-       /* Create 2nd stage page table mapping - Level 3 */
-       old_pte = *pte;
-       if (pte_present(old_pte)) {
-               /* Skip page table update if there is no change */
-               if (pte_val(old_pte) == pte_val(*new_pte))
-                       return 0;
-
-               kvm_set_pte(pte, __pte(0));
-               kvm_tlb_flush_vmid_ipa(kvm, addr);
-       } else {
-               get_page(virt_to_page(pte));
-       }
-
-       kvm_set_pte(pte, *new_pte);
-       return 0;
-}
-
-#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-static int stage2_ptep_test_and_clear_young(pte_t *pte)
-{
-       if (pte_young(*pte)) {
-               *pte = pte_mkold(*pte);
-               return 1;
-       }
-       return 0;
-}
-#else
-static int stage2_ptep_test_and_clear_young(pte_t *pte)
-{
-       return __ptep_test_and_clear_young(pte);
-}
-#endif
-
-static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
-{
-       return stage2_ptep_test_and_clear_young((pte_t *)pmd);
-}
-
-static int stage2_pudp_test_and_clear_young(pud_t *pud)
-{
-       return stage2_ptep_test_and_clear_young((pte_t *)pud);
-}
-
-/**
- * kvm_phys_addr_ioremap - map a device range to guest IPA
- *
- * @kvm:       The KVM pointer
- * @guest_ipa: The IPA at which to insert the mapping
- * @pa:                The physical address of the device
- * @size:      The size of the mapping
- */
-int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
-                         phys_addr_t pa, unsigned long size, bool writable)
-{
-       phys_addr_t addr, end;
-       int ret = 0;
-       unsigned long pfn;
-       struct kvm_mmu_memory_cache cache = { 0, };
-
-       end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
-       pfn = __phys_to_pfn(pa);
-
-       for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
-               pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE);
-
-               if (writable)
-                       pte = kvm_s2pte_mkwrite(pte);
-
-               ret = mmu_topup_memory_cache(&cache,
-                                            kvm_mmu_cache_min_pages(kvm),
-                                            KVM_NR_MEM_OBJS);
-               if (ret)
-                       goto out;
-               spin_lock(&kvm->mmu_lock);
-               ret = stage2_set_pte(kvm, &cache, addr, &pte,
-                                               KVM_S2PTE_FLAG_IS_IOMAP);
-               spin_unlock(&kvm->mmu_lock);
-               if (ret)
-                       goto out;
-
-               pfn++;
-       }
-
-out:
-       mmu_free_memory_cache(&cache);
-       return ret;
-}
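
The loop above rounds the end of the range up to a page boundary and installs one device PTE per page. Below is a standalone sketch of that walk, with local SK_PAGE_* constants standing in for the kernel's PAGE_* macros and made-up example addresses (illustrative C, not kernel code):

#include <stdint.h>
#include <stdio.h>

#define SK_PAGE_SHIFT 12
#define SK_PAGE_SIZE  (1ULL << SK_PAGE_SHIFT)
#define SK_PAGE_MASK  (~(SK_PAGE_SIZE - 1))

int main(void)
{
	uint64_t guest_ipa = 0x10001000ULL;	/* IPA of the device window    */
	uint64_t pa        = 0x3f201000ULL;	/* host physical address       */
	uint64_t size      = 0x1234ULL;		/* not page aligned on purpose */

	/* Round the end up so a partial last page is still covered. */
	uint64_t end = (guest_ipa + size + SK_PAGE_SIZE - 1) & SK_PAGE_MASK;
	uint64_t pfn = pa >> SK_PAGE_SHIFT;
	uint64_t addr;

	for (addr = guest_ipa; addr < end; addr += SK_PAGE_SIZE, pfn++)
		printf("map IPA 0x%llx -> PFN 0x%llx\n",
		       (unsigned long long)addr, (unsigned long long)pfn);

	return 0;
}
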
-
-static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
-{
-       kvm_pfn_t pfn = *pfnp;
-       gfn_t gfn = *ipap >> PAGE_SHIFT;
-
-       if (kvm_is_transparent_hugepage(pfn)) {
-               unsigned long mask;
-               /*
-                * The address we faulted on is backed by a transparent huge
-                * page.  However, because we map the compound huge page and
-                * not the individual tail page, we need to transfer the
-                * refcount to the head page.  We have to be careful that the
-                * THP doesn't start to split while we are adjusting the
-                * refcounts.
-                *
-                * We are sure this doesn't happen, because mmu_notifier_retry
-                * was successful and we are holding the mmu_lock, so if this
-                * THP is trying to split, it will be blocked in the mmu
-                * notifier before touching any of the pages, specifically
-                * before being able to call __split_huge_page_refcount().
-                *
-                * We can therefore safely transfer the refcount from PG_tail
-                * to PG_head and switch the pfn from a tail page to the head
-                * page accordingly.
-                */
-               mask = PTRS_PER_PMD - 1;
-               VM_BUG_ON((gfn & mask) != (pfn & mask));
-               if (pfn & mask) {
-                       *ipap &= PMD_MASK;
-                       kvm_release_pfn_clean(pfn);
-                       pfn &= ~mask;
-                       kvm_get_pfn(pfn);
-                       *pfnp = pfn;
-               }
-
-               return true;
-       }
-
-       return false;
-}
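
Refcount handling aside, the address adjustment above is pure mask arithmetic: both the IPA and the PFN are rounded down to the start of the PMD-sized block. A minimal sketch, assuming 4K pages and 512 entries per PMD (2M blocks); the values are hypothetical examples:

#include <stdint.h>
#include <stdio.h>

#define SK_PTRS_PER_PMD 512ULL
#define SK_PAGE_SHIFT   12
#define SK_PMD_MASK     (~((SK_PTRS_PER_PMD << SK_PAGE_SHIFT) - 1))

int main(void)
{
	uint64_t ipa  = 0x40123000ULL;	/* faulting IPA inside a 2M block  */
	uint64_t pfn  = 0x80123ULL;	/* tail-page PFN backing that IPA  */
	uint64_t mask = SK_PTRS_PER_PMD - 1;

	if (pfn & mask) {		/* not already the head page */
		ipa &= SK_PMD_MASK;	/* IPA of the 2M block       */
		pfn &= ~mask;		/* PFN of the head page      */
	}

	printf("block IPA 0x%llx, head PFN 0x%llx\n",
	       (unsigned long long)ipa, (unsigned long long)pfn);
	return 0;
}
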
-
-/**
- * stage2_wp_ptes - write protect PMD range
- * @pmd:       pointer to pmd entry
- * @addr:      range start address
- * @end:       range end address
- */
-static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
-{
-       pte_t *pte;
-
-       pte = pte_offset_kernel(pmd, addr);
-       do {
-               if (!pte_none(*pte)) {
-                       if (!kvm_s2pte_readonly(pte))
-                               kvm_set_s2pte_readonly(pte);
-               }
-       } while (pte++, addr += PAGE_SIZE, addr != end);
-}
-
-/**
- * stage2_wp_pmds - write protect PUD range
- * @kvm:       kvm instance for the VM
- * @pud:       pointer to pud entry
- * @addr:      range start address
- * @end:       range end address
- */
-static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
-                          phys_addr_t addr, phys_addr_t end)
-{
-       pmd_t *pmd;
-       phys_addr_t next;
-
-       pmd = stage2_pmd_offset(kvm, pud, addr);
-
-       do {
-               next = stage2_pmd_addr_end(kvm, addr, end);
-               if (!pmd_none(*pmd)) {
-                       if (pmd_thp_or_huge(*pmd)) {
-                               if (!kvm_s2pmd_readonly(pmd))
-                                       kvm_set_s2pmd_readonly(pmd);
-                       } else {
-                               stage2_wp_ptes(pmd, addr, next);
-                       }
-               }
-       } while (pmd++, addr = next, addr != end);
-}
-
-/**
- * stage2_wp_puds - write protect PGD range
- * @pgd:       pointer to pgd entry
- * @addr:      range start address
- * @end:       range end address
- */
-static void  stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
-                           phys_addr_t addr, phys_addr_t end)
-{
-       pud_t *pud;
-       phys_addr_t next;
-
-       pud = stage2_pud_offset(kvm, pgd, addr);
-       do {
-               next = stage2_pud_addr_end(kvm, addr, end);
-               if (!stage2_pud_none(kvm, *pud)) {
-                       if (stage2_pud_huge(kvm, *pud)) {
-                               if (!kvm_s2pud_readonly(pud))
-                                       kvm_set_s2pud_readonly(pud);
-                       } else {
-                               stage2_wp_pmds(kvm, pud, addr, next);
-                       }
-               }
-       } while (pud++, addr = next, addr != end);
-}
-
-/**
- * stage2_wp_range() - write protect stage2 memory region range
- * @kvm:       The KVM pointer
- * @addr:      Start address of range
- * @end:       End address of range
- */
-static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
-{
-       pgd_t *pgd;
-       phys_addr_t next;
-
-       pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
-       do {
-               /*
-                * Release kvm_mmu_lock periodically if the memory region is
-                * large. Otherwise, we may see kernel panics with
-                * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
-                * CONFIG_LOCKDEP. Additionally, holding the lock for too long
-                * will also starve other vCPUs. We also have to make sure
-                * that the page tables are not freed while the lock is
-                * released.
-                */
-               cond_resched_lock(&kvm->mmu_lock);
-               if (!READ_ONCE(kvm->arch.pgd))
-                       break;
-               next = stage2_pgd_addr_end(kvm, addr, end);
-               if (stage2_pgd_present(kvm, *pgd))
-                       stage2_wp_puds(kvm, pgd, addr, next);
-       } while (pgd++, addr = next, addr != end);
-}
-
-/**
- * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
- * @kvm:       The KVM pointer
- * @slot:      The memory slot to write protect
- *
- * Called to start logging dirty pages after the KVM_MEM_LOG_DIRTY_PAGES
- * operation is requested for a memory region. After this function returns,
- * all present PUD, PMD and PTE entries in the memory region are write
- * protected, and the dirty page log can then be read.
- *
- * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
- * serializing operations for VM memory regions.
- */
-void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
-{
-       struct kvm_memslots *slots = kvm_memslots(kvm);
-       struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
-       phys_addr_t start, end;
-
-       if (WARN_ON_ONCE(!memslot))
-               return;
-
-       start = memslot->base_gfn << PAGE_SHIFT;
-       end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
-
-       spin_lock(&kvm->mmu_lock);
-       stage2_wp_range(kvm, start, end);
-       spin_unlock(&kvm->mmu_lock);
-       kvm_flush_remote_tlbs(kvm);
-}
-
-/**
- * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
- * @kvm:       The KVM pointer
- * @slot:      The memory slot associated with mask
- * @gfn_offset:        The gfn offset in memory slot
- * @mask:      The mask of dirty pages at offset 'gfn_offset' in this memory
- *             slot to be write protected
- *
- * Walks the bits set in mask and write protects the associated PTEs.
- * Caller must acquire kvm_mmu_lock.
- */
-static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
-               struct kvm_memory_slot *slot,
-               gfn_t gfn_offset, unsigned long mask)
-{
-       phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
-       phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
-       phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
-
-       stage2_wp_range(kvm, start, end);
-}
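
The range computation above only needs the first and last set bit of the 64-bit dirty mask. A standalone sketch with example values, using __builtin_ctzll/__builtin_clzll as stand-ins for the kernel's __ffs()/__fls():

#include <stdint.h>
#include <stdio.h>

#define SK_PAGE_SHIFT 12

int main(void)
{
	uint64_t base_gfn = 0x140;		/* slot->base_gfn + gfn_offset */
	uint64_t mask = 0x0000f00000000010ULL;	/* dirty bits for this word    */

	uint64_t first = __builtin_ctzll(mask);		/* __ffs(mask) -> 4  */
	uint64_t last  = 63 - __builtin_clzll(mask);	/* __fls(mask) -> 47 */

	uint64_t start = (base_gfn + first) << SK_PAGE_SHIFT;
	uint64_t end   = (base_gfn + last + 1) << SK_PAGE_SHIFT;

	printf("write-protect [0x%llx, 0x%llx)\n",
	       (unsigned long long)start, (unsigned long long)end);
	return 0;
}
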
-
-/*
- * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
- * dirty pages.
- *
- * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
- * enable dirty logging for them.
- */
-void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
-               struct kvm_memory_slot *slot,
-               gfn_t gfn_offset, unsigned long mask)
-{
-       kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
-}
-
-static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
-{
-       __clean_dcache_guest_page(pfn, size);
-}
-
-static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
-{
-       __invalidate_icache_guest_page(pfn, size);
-}
-
-static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
-{
-       send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
-}
-
-static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
-                                              unsigned long hva,
-                                              unsigned long map_size)
-{
-       gpa_t gpa_start;
-       hva_t uaddr_start, uaddr_end;
-       size_t size;
-
-       size = memslot->npages * PAGE_SIZE;
-
-       gpa_start = memslot->base_gfn << PAGE_SHIFT;
-
-       uaddr_start = memslot->userspace_addr;
-       uaddr_end = uaddr_start + size;
-
-       /*
-        * Pages belonging to memslots that don't have the same alignment
-        * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
-        * PMD/PUD entries, because we'll end up mapping the wrong pages.
-        *
-        * Consider a layout like the following:
-        *
-        *    memslot->userspace_addr:
-        *    +-----+--------------------+--------------------+---+
-        *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
-        *    +-----+--------------------+--------------------+---+
-        *
-        *    memslot->base_gfn << PAGE_SHIFT:
-        *      +---+--------------------+--------------------+-----+
-        *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
-        *      +---+--------------------+--------------------+-----+
-        *
-        * If we create those stage-2 blocks, we'll end up with this incorrect
-        * mapping:
-        *   d -> f
-        *   e -> g
-        *   f -> h
-        */
-       if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
-               return false;
-
-       /*
-        * Next, let's make sure we're not trying to map anything not covered
-        * by the memslot. This means we have to prohibit block size mappings
-        * for the beginning and end of a non-block aligned and non-block sized
-        * memory slot (illustrated by the head and tail parts of the
-        * userspace view above containing pages 'abcde' and 'xyz',
-        * respectively).
-        *
-        * Note that it doesn't matter if we do the check using the
-        * userspace_addr or the base_gfn, as both are equally aligned (per
-        * the check above) and equally sized.
-        */
-       return (hva & ~(map_size - 1)) >= uaddr_start &&
-              (hva & ~(map_size - 1)) + map_size <= uaddr_end;
-}
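
The two conditions above reduce to mask arithmetic on the block size. A userspace sketch of the same checks, with map_size standing for PMD_SIZE or PUD_SIZE:

#include <stdbool.h>
#include <stdint.h>

static bool supports_huge_mapping(uint64_t gpa_start, uint64_t uaddr_start,
				  uint64_t uaddr_end, uint64_t hva,
				  uint64_t map_size)
{
	/* HVA and IPA must share the same offset within a block. */
	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
		return false;

	/* The whole block around hva must lie inside the memslot's HVA range. */
	return (hva & ~(map_size - 1)) >= uaddr_start &&
	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
}
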
-
-static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
-                         struct kvm_memory_slot *memslot, unsigned long hva,
-                         unsigned long fault_status)
-{
-       int ret;
-       bool write_fault, writable, force_pte = false;
-       bool exec_fault, needs_exec;
-       unsigned long mmu_seq;
-       gfn_t gfn = fault_ipa >> PAGE_SHIFT;
-       struct kvm *kvm = vcpu->kvm;
-       struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
-       struct vm_area_struct *vma;
-       short vma_shift;
-       kvm_pfn_t pfn;
-       pgprot_t mem_type = PAGE_S2;
-       bool logging_active = memslot_is_logging(memslot);
-       unsigned long vma_pagesize, flags = 0;
-
-       write_fault = kvm_is_write_fault(vcpu);
-       exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
-       VM_BUG_ON(write_fault && exec_fault);
-
-       if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
-               kvm_err("Unexpected L2 read permission error\n");
-               return -EFAULT;
-       }
-
-       /* Let's check if we will get back a huge page backed by hugetlbfs */
-       down_read(&current->mm->mmap_sem);
-       vma = find_vma_intersection(current->mm, hva, hva + 1);
-       if (unlikely(!vma)) {
-               kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
-               up_read(&current->mm->mmap_sem);
-               return -EFAULT;
-       }
-
-       if (is_vm_hugetlb_page(vma))
-               vma_shift = huge_page_shift(hstate_vma(vma));
-       else
-               vma_shift = PAGE_SHIFT;
-
-       vma_pagesize = 1ULL << vma_shift;
-       if (logging_active ||
-           (vma->vm_flags & VM_PFNMAP) ||
-           !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
-               force_pte = true;
-               vma_pagesize = PAGE_SIZE;
-       }
-
-       /*
-        * The stage2 page tables have a minimum of 2 levels (for arm64, see
-        * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
-        * use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
-        * As for PUD huge maps, we must make sure that we have at least
-        * 3 levels, i.e, PMD is not folded.
-        */
-       if (vma_pagesize == PMD_SIZE ||
-           (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
-               gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
-       up_read(&current->mm->mmap_sem);
-
-       /* We need minimum second+third level pages */
-       ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm),
-                                    KVM_NR_MEM_OBJS);
-       if (ret)
-               return ret;
-
-       mmu_seq = vcpu->kvm->mmu_notifier_seq;
-       /*
-        * Ensure the read of mmu_notifier_seq happens before we call
-        * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
-        * the page we just got a reference to getting unmapped before we have
-        * a chance to grab the mmu_lock, which ensures that if the page gets
-        * unmapped afterwards, the call to kvm_unmap_hva will take it away
-        * from us again properly. This smp_rmb() interacts with the smp_wmb()
-        * in kvm_mmu_notifier_invalidate_<page|range_end>.
-        */
-       smp_rmb();
-
-       pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
-       if (pfn == KVM_PFN_ERR_HWPOISON) {
-               kvm_send_hwpoison_signal(hva, vma_shift);
-               return 0;
-       }
-       if (is_error_noslot_pfn(pfn))
-               return -EFAULT;
-
-       if (kvm_is_device_pfn(pfn)) {
-               mem_type = PAGE_S2_DEVICE;
-               flags |= KVM_S2PTE_FLAG_IS_IOMAP;
-       } else if (logging_active) {
-               /*
-                * Faults on pages in a memslot with logging enabled
-                * should not be mapped with huge pages (it introduces churn
-                * and performance degradation), so force a pte mapping.
-                */
-               flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
-
-               /*
-                * Only actually map the page as writable if this was a write
-                * fault.
-                */
-               if (!write_fault)
-                       writable = false;
-       }
-
-       if (exec_fault && is_iomap(flags))
-               return -ENOEXEC;
-
-       spin_lock(&kvm->mmu_lock);
-       if (mmu_notifier_retry(kvm, mmu_seq))
-               goto out_unlock;
-
-       if (vma_pagesize == PAGE_SIZE && !force_pte) {
-               /*
-                * Only PMD_SIZE transparent hugepages (THP) are
-                * currently supported. This code will need to be
-                * updated to support other THP sizes.
-                *
-                * Make sure the host VA and the guest IPA are sufficiently
-                * aligned and that the block is contained within the memslot.
-                */
-               if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
-                   transparent_hugepage_adjust(&pfn, &fault_ipa))
-                       vma_pagesize = PMD_SIZE;
-       }
-
-       if (writable)
-               kvm_set_pfn_dirty(pfn);
-
-       if (fault_status != FSC_PERM && !is_iomap(flags))
-               clean_dcache_guest_page(pfn, vma_pagesize);
-
-       if (exec_fault)
-               invalidate_icache_guest_page(pfn, vma_pagesize);
-
-       /*
-        * If we took an execution fault we have made the
-        * icache/dcache coherent above and should now let the s2
-        * mapping be executable.
-        *
-        * Write faults (!exec_fault && FSC_PERM) are orthogonal to
-        * execute permissions, and we preserve whatever we have.
-        */
-       needs_exec = exec_fault ||
-               (fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa));
-
-       if (vma_pagesize == PUD_SIZE) {
-               pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
-
-               new_pud = kvm_pud_mkhuge(new_pud);
-               if (writable)
-                       new_pud = kvm_s2pud_mkwrite(new_pud);
-
-               if (needs_exec)
-                       new_pud = kvm_s2pud_mkexec(new_pud);
-
-               ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud);
-       } else if (vma_pagesize == PMD_SIZE) {
-               pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
-
-               new_pmd = kvm_pmd_mkhuge(new_pmd);
-
-               if (writable)
-                       new_pmd = kvm_s2pmd_mkwrite(new_pmd);
-
-               if (needs_exec)
-                       new_pmd = kvm_s2pmd_mkexec(new_pmd);
-
-               ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
-       } else {
-               pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
-
-               if (writable) {
-                       new_pte = kvm_s2pte_mkwrite(new_pte);
-                       mark_page_dirty(kvm, gfn);
-               }
-
-               if (needs_exec)
-                       new_pte = kvm_s2pte_mkexec(new_pte);
-
-               ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
-       }
-
-out_unlock:
-       spin_unlock(&kvm->mmu_lock);
-       kvm_set_pfn_accessed(pfn);
-       kvm_release_pfn_clean(pfn);
-       return ret;
-}
-
-/*
- * Resolve the access fault by making the page young again.
- * Note that because the faulting entry is guaranteed not to be
- * cached in the TLB, we don't need to invalidate anything.
- * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
- * so there is no need for atomic (pte|pmd)_mkyoung operations.
- */
-static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
-{
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
-       kvm_pfn_t pfn;
-       bool pfn_valid = false;
-
-       trace_kvm_access_fault(fault_ipa);
-
-       spin_lock(&vcpu->kvm->mmu_lock);
-
-       if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte))
-               goto out;
-
-       if (pud) {              /* HugeTLB */
-               *pud = kvm_s2pud_mkyoung(*pud);
-               pfn = kvm_pud_pfn(*pud);
-               pfn_valid = true;
-       } else  if (pmd) {      /* THP, HugeTLB */
-               *pmd = pmd_mkyoung(*pmd);
-               pfn = pmd_pfn(*pmd);
-               pfn_valid = true;
-       } else {
-               *pte = pte_mkyoung(*pte);       /* Just a page... */
-               pfn = pte_pfn(*pte);
-               pfn_valid = true;
-       }
-
-out:
-       spin_unlock(&vcpu->kvm->mmu_lock);
-       if (pfn_valid)
-               kvm_set_pfn_accessed(pfn);
-}
-
-/**
- * kvm_handle_guest_abort - handles all 2nd stage aborts
- * @vcpu:      the VCPU pointer
- * @run:       the kvm_run structure
- *
- * Any abort that gets to the host is almost guaranteed to be caused by a
- * missing second stage translation table entry, which can mean either that the
- * guest simply needs more memory and we must allocate an appropriate page, or
- * that the guest tried to access I/O memory, which is emulated by user
- * space. The distinction is based on the IPA causing the fault and whether this
- * memory region has been registered as standard RAM by user space.
- */
-int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
-{
-       unsigned long fault_status;
-       phys_addr_t fault_ipa;
-       struct kvm_memory_slot *memslot;
-       unsigned long hva;
-       bool is_iabt, write_fault, writable;
-       gfn_t gfn;
-       int ret, idx;
-
-       fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
-
-       fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
-       is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
-
-       /* Synchronous External Abort? */
-       if (kvm_vcpu_dabt_isextabt(vcpu)) {
-               /*
-                * For RAS the host kernel may handle this abort.
-                * There is no need to pass the error into the guest.
-                */
-               if (!kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu)))
-                       return 1;
-
-               if (unlikely(!is_iabt)) {
-                       kvm_inject_vabt(vcpu);
-                       return 1;
-               }
-       }
-
-       trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
-                             kvm_vcpu_get_hfar(vcpu), fault_ipa);
-
-       /* Check that the stage-2 fault is a translation, permission or access fault */
-       if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
-           fault_status != FSC_ACCESS) {
-               kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
-                       kvm_vcpu_trap_get_class(vcpu),
-                       (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
-                       (unsigned long)kvm_vcpu_get_hsr(vcpu));
-               return -EFAULT;
-       }
-
-       idx = srcu_read_lock(&vcpu->kvm->srcu);
-
-       gfn = fault_ipa >> PAGE_SHIFT;
-       memslot = gfn_to_memslot(vcpu->kvm, gfn);
-       hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
-       write_fault = kvm_is_write_fault(vcpu);
-       if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
-               if (is_iabt) {
-                       /* Prefetch Abort on I/O address */
-                       ret = -ENOEXEC;
-                       goto out;
-               }
-
-               /*
-                * Check for a cache maintenance operation. Since we
-                * ended-up here, we know it is outside of any memory
-                * slot. But we can't find out if that is for a device,
-                * or if the guest is just being stupid. The only thing
-                * we know for sure is that this range cannot be cached.
-                *
-                * So let's assume that the guest is just being
-                * cautious, and skip the instruction.
-                */
-               if (kvm_vcpu_dabt_is_cm(vcpu)) {
-                       kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
-                       ret = 1;
-                       goto out_unlock;
-               }
-
-               /*
-                * The IPA is reported as [MAX:12], so we need to
-                * complete it with the bottom 12 bits from the
-                * faulting VA. This is always 12 bits, irrespective
-                * of the page size.
-                */
-               fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
-               ret = io_mem_abort(vcpu, run, fault_ipa);
-               goto out_unlock;
-       }
-
-       /* Userspace should not be able to register out-of-bounds IPAs */
-       VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
-
-       if (fault_status == FSC_ACCESS) {
-               handle_access_fault(vcpu, fault_ipa);
-               ret = 1;
-               goto out_unlock;
-       }
-
-       ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
-       if (ret == 0)
-               ret = 1;
-out:
-       if (ret == -ENOEXEC) {
-               kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
-               ret = 1;
-       }
-out_unlock:
-       srcu_read_unlock(&vcpu->kvm->srcu, idx);
-       return ret;
-}
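
The IPA completion step in the MMIO path above is a single OR of the page offset taken from the faulting VA into the HPFAR-derived IPA. A tiny sketch of that step (illustrative, with plain integer arguments):

#include <stdint.h>

/* fault_ipa carries bits [MAX:12]; the bottom 12 bits come from the faulting VA. */
static uint64_t complete_fault_ipa(uint64_t fault_ipa, uint64_t fault_va)
{
	return fault_ipa | (fault_va & ((1ULL << 12) - 1));
}
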
-
-static int handle_hva_to_gpa(struct kvm *kvm,
-                            unsigned long start,
-                            unsigned long end,
-                            int (*handler)(struct kvm *kvm,
-                                           gpa_t gpa, u64 size,
-                                           void *data),
-                            void *data)
-{
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
-       int ret = 0;
-
-       slots = kvm_memslots(kvm);
-
-       /* we only care about the pages that the guest sees */
-       kvm_for_each_memslot(memslot, slots) {
-               unsigned long hva_start, hva_end;
-               gfn_t gpa;
-
-               hva_start = max(start, memslot->userspace_addr);
-               hva_end = min(end, memslot->userspace_addr +
-                                       (memslot->npages << PAGE_SHIFT));
-               if (hva_start >= hva_end)
-                       continue;
-
-               gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
-               ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
-       }
-
-       return ret;
-}
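
The per-memslot handling above is an interval intersection followed by an HVA-to-GPA offset translation. A standalone sketch with a plain struct standing in for the kernel's memslot type, assuming page-aligned inputs:

#include <stdint.h>

#define SK_PAGE_SHIFT 12

struct sketch_memslot {
	uint64_t userspace_addr;	/* HVA where the slot is mapped   */
	uint64_t base_gfn;		/* first guest frame of the slot  */
	uint64_t npages;
};

static int clamp_to_slot(const struct sketch_memslot *slot,
			 uint64_t start, uint64_t end,
			 uint64_t *gpa, uint64_t *size)
{
	uint64_t slot_end = slot->userspace_addr +
			    (slot->npages << SK_PAGE_SHIFT);
	uint64_t hva_start = start > slot->userspace_addr ?
			     start : slot->userspace_addr;
	uint64_t hva_end = end < slot_end ? end : slot_end;

	if (hva_start >= hva_end)
		return 0;	/* no overlap with this slot */

	/* Translate the surviving HVA range to a guest-physical range. */
	*gpa = (slot->base_gfn << SK_PAGE_SHIFT) +
	       (hva_start - slot->userspace_addr);
	*size = hva_end - hva_start;
	return 1;
}
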
-
-static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
-       unmap_stage2_range(kvm, gpa, size);
-       return 0;
-}
-
-int kvm_unmap_hva_range(struct kvm *kvm,
-                       unsigned long start, unsigned long end)
-{
-       if (!kvm->arch.pgd)
-               return 0;
-
-       trace_kvm_unmap_hva_range(start, end);
-       handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
-       return 0;
-}
-
-static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
-       pte_t *pte = (pte_t *)data;
-
-       WARN_ON(size != PAGE_SIZE);
-       /*
-        * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
-        * flag clear because MMU notifiers will have unmapped a huge PMD before
-        * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
-        * therefore stage2_set_pte() never needs to clear out a huge PMD
-        * through this calling path.
-        */
-       stage2_set_pte(kvm, NULL, gpa, pte, 0);
-       return 0;
-}
-
-
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
-{
-       unsigned long end = hva + PAGE_SIZE;
-       kvm_pfn_t pfn = pte_pfn(pte);
-       pte_t stage2_pte;
-
-       if (!kvm->arch.pgd)
-               return 0;
-
-       trace_kvm_set_spte_hva(hva);
-
-       /*
-        * We've moved a page around, probably through CoW, so let's treat it
-        * just like a translation fault and clean the cache to the PoC.
-        */
-       clean_dcache_guest_page(pfn, PAGE_SIZE);
-       stage2_pte = kvm_pfn_pte(pfn, PAGE_S2);
-       handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
-
-       return 0;
-}
-
-static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
-
-       WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
-       if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
-               return 0;
-
-       if (pud)
-               return stage2_pudp_test_and_clear_young(pud);
-       else if (pmd)
-               return stage2_pmdp_test_and_clear_young(pmd);
-       else
-               return stage2_ptep_test_and_clear_young(pte);
-}
-
-static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
-
-       WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
-       if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
-               return 0;
-
-       if (pud)
-               return kvm_s2pud_young(*pud);
-       else if (pmd)
-               return pmd_young(*pmd);
-       else
-               return pte_young(*pte);
-}
-
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
-{
-       if (!kvm->arch.pgd)
-               return 0;
-       trace_kvm_age_hva(start, end);
-       return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
-}
-
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
-{
-       if (!kvm->arch.pgd)
-               return 0;
-       trace_kvm_test_age_hva(hva);
-       return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
-                                kvm_test_age_hva_handler, NULL);
-}
-
-void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
-{
-       mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
-}
-
-phys_addr_t kvm_mmu_get_httbr(void)
-{
-       if (__kvm_cpu_uses_extended_idmap())
-               return virt_to_phys(merged_hyp_pgd);
-       else
-               return virt_to_phys(hyp_pgd);
-}
-
-phys_addr_t kvm_get_idmap_vector(void)
-{
-       return hyp_idmap_vector;
-}
-
-static int kvm_map_idmap_text(pgd_t *pgd)
-{
-       int err;
-
-       /* Create the idmap in the boot page tables */
-       err =   __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
-                                     hyp_idmap_start, hyp_idmap_end,
-                                     __phys_to_pfn(hyp_idmap_start),
-                                     PAGE_HYP_EXEC);
-       if (err)
-               kvm_err("Failed to idmap %lx-%lx\n",
-                       hyp_idmap_start, hyp_idmap_end);
-
-       return err;
-}
-
-int kvm_mmu_init(void)
-{
-       int err;
-
-       hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start);
-       hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
-       hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end);
-       hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
-       hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init);
-
-       /*
-        * We rely on the linker script to ensure at build time that the HYP
-        * init code does not cross a page boundary.
-        */
-       BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
-
-       kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
-       kvm_debug("HYP VA range: %lx:%lx\n",
-                 kern_hyp_va(PAGE_OFFSET),
-                 kern_hyp_va((unsigned long)high_memory - 1));
-
-       if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
-           hyp_idmap_start <  kern_hyp_va((unsigned long)high_memory - 1) &&
-           hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
-               /*
-                * The idmap page is intersecting with the VA space,
-                * it is not safe to continue further.
-                */
-               kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
-               err = -EINVAL;
-               goto out;
-       }
-
-       hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
-       if (!hyp_pgd) {
-               kvm_err("Hyp mode PGD not allocated\n");
-               err = -ENOMEM;
-               goto out;
-       }
-
-       if (__kvm_cpu_uses_extended_idmap()) {
-               boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
-                                                        hyp_pgd_order);
-               if (!boot_hyp_pgd) {
-                       kvm_err("Hyp boot PGD not allocated\n");
-                       err = -ENOMEM;
-                       goto out;
-               }
-
-               err = kvm_map_idmap_text(boot_hyp_pgd);
-               if (err)
-                       goto out;
-
-               merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-               if (!merged_hyp_pgd) {
-                       kvm_err("Failed to allocate extra HYP pgd\n");
-                       goto out;
-               }
-               __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
-                                   hyp_idmap_start);
-       } else {
-               err = kvm_map_idmap_text(hyp_pgd);
-               if (err)
-                       goto out;
-       }
-
-       io_map_base = hyp_idmap_start;
-       return 0;
-out:
-       free_hyp_pgds();
-       return err;
-}
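
The BUG_ON() above relies on a compact page-crossing test: two addresses live on the same page exactly when XOR-ing them leaves nothing above the page offset. A small sketch of the same check, with a local page size stand-in:

#include <stdbool.h>
#include <stdint.h>

#define SK_PAGE_SIZE 4096ULL
#define SK_PAGE_MASK (~(SK_PAGE_SIZE - 1))

/* Non-zero when start and (end - 1) fall on different pages. */
static bool crosses_page(uint64_t start, uint64_t end)
{
	return ((start ^ (end - 1)) & SK_PAGE_MASK) != 0;
}
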
-
-void kvm_arch_commit_memory_region(struct kvm *kvm,
-                                  const struct kvm_userspace_memory_region *mem,
-                                  struct kvm_memory_slot *old,
-                                  const struct kvm_memory_slot *new,
-                                  enum kvm_mr_change change)
-{
-       /*
-        * At this point memslot has been committed and there is an
-        * allocated dirty_bitmap[], dirty pages will be tracked while the
-        * memory slot is write protected.
-        */
-       if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES)
-               kvm_mmu_wp_memory_region(kvm, mem->slot);
-}
-
-int kvm_arch_prepare_memory_region(struct kvm *kvm,
-                                  struct kvm_memory_slot *memslot,
-                                  const struct kvm_userspace_memory_region *mem,
-                                  enum kvm_mr_change change)
-{
-       hva_t hva = mem->userspace_addr;
-       hva_t reg_end = hva + mem->memory_size;
-       bool writable = !(mem->flags & KVM_MEM_READONLY);
-       int ret = 0;
-
-       if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
-                       change != KVM_MR_FLAGS_ONLY)
-               return 0;
-
-       /*
-        * Prevent userspace from creating a memory region outside of the IPA
-        * space addressable by the KVM guest.
-        */
-       if (memslot->base_gfn + memslot->npages >=
-           (kvm_phys_size(kvm) >> PAGE_SHIFT))
-               return -EFAULT;
-
-       down_read(&current->mm->mmap_sem);
-       /*
-        * A memory region could potentially cover multiple VMAs, and any holes
-        * between them, so iterate over all of them to find out if we can map
-        * any of them right now.
-        *
-        *     +--------------------------------------------+
-        * +---------------+----------------+   +----------------+
-        * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
-        * +---------------+----------------+   +----------------+
-        *     |               memory region                |
-        *     +--------------------------------------------+
-        */
-       do {
-               struct vm_area_struct *vma = find_vma(current->mm, hva);
-               hva_t vm_start, vm_end;
-
-               if (!vma || vma->vm_start >= reg_end)
-                       break;
-
-               /*
-                * Take the intersection of this VMA with the memory region
-                */
-               vm_start = max(hva, vma->vm_start);
-               vm_end = min(reg_end, vma->vm_end);
-
-               if (vma->vm_flags & VM_PFNMAP) {
-                       gpa_t gpa = mem->guest_phys_addr +
-                                   (vm_start - mem->userspace_addr);
-                       phys_addr_t pa;
-
-                       pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
-                       pa += vm_start - vma->vm_start;
-
-                       /* IO region dirty page logging not allowed */
-                       if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
-                               ret = -EINVAL;
-                               goto out;
-                       }
-
-                       ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
-                                                   vm_end - vm_start,
-                                                   writable);
-                       if (ret)
-                               break;
-               }
-               hva = vm_end;
-       } while (hva < reg_end);
-
-       if (change == KVM_MR_FLAGS_ONLY)
-               goto out;
-
-       spin_lock(&kvm->mmu_lock);
-       if (ret)
-               unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
-       else
-               stage2_flush_memslot(kvm, memslot);
-       spin_unlock(&kvm->mmu_lock);
-out:
-       up_read(&current->mm->mmap_sem);
-       return ret;
-}
-
-void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
-{
-}
-
-void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
-{
-}
-
-void kvm_arch_flush_shadow_all(struct kvm *kvm)
-{
-       kvm_free_stage2_pgd(kvm);
-}
-
-void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
-                                  struct kvm_memory_slot *slot)
-{
-       gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
-       phys_addr_t size = slot->npages << PAGE_SHIFT;
-
-       spin_lock(&kvm->mmu_lock);
-       unmap_stage2_range(kvm, gpa, size);
-       spin_unlock(&kvm->mmu_lock);
-}
-
-/*
- * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
- *
- * Main problems:
- * - S/W ops are local to a CPU (not broadcast)
- * - We have line migration behind our back (speculation)
- * - System caches don't support S/W at all (damn!)
- *
- * In the face of the above, the best we can do is to try and convert
- * S/W ops to VA ops. Because the guest is not allowed to infer the
- * S/W to PA mapping, it can only use S/W to nuke the whole cache,
- * which is a rather good thing for us.
- *
- * Also, it is only used when turning caches on/off ("The expected
- * usage of the cache maintenance instructions that operate by set/way
- * is associated with the cache maintenance instructions associated
- * with the powerdown and powerup of caches, if this is required by
- * the implementation.").
- *
- * We use the following policy:
- *
- * - If we trap a S/W operation, we enable VM trapping to detect
- *   caches being turned on/off, and do a full clean.
- *
- * - We flush the caches both when they are turned on and when they are turned off.
- *
- * - Once the caches are enabled, we stop trapping VM ops.
- */
-void kvm_set_way_flush(struct kvm_vcpu *vcpu)
-{
-       unsigned long hcr = *vcpu_hcr(vcpu);
-
-       /*
-        * If this is the first time we do a S/W operation
-        * (i.e. HCR_TVM not set), flush the whole memory and enable
-        * VM trapping.
-        *
-        * Otherwise, rely on the VM trapping to wait for the MMU +
-        * Caches to be turned off. At that point, we'll be able to
-        * clean the caches again.
-        */
-       if (!(hcr & HCR_TVM)) {
-               trace_kvm_set_way_flush(*vcpu_pc(vcpu),
-                                       vcpu_has_cache_enabled(vcpu));
-               stage2_flush_vm(vcpu->kvm);
-               *vcpu_hcr(vcpu) = hcr | HCR_TVM;
-       }
-}
-
-void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
-{
-       bool now_enabled = vcpu_has_cache_enabled(vcpu);
-
-       /*
-        * If switching the MMU+caches on, need to invalidate the caches.
-        * If switching it off, need to clean the caches.
-        * Clean + invalidate does the trick always.
-        */
-       if (now_enabled != was_enabled)
-               stage2_flush_vm(vcpu->kvm);
-
-       /* Caches are now on, stop trapping VM ops (until a S/W op) */
-       if (now_enabled)
-               *vcpu_hcr(vcpu) &= ~HCR_TVM;
-
-       trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
-}
diff --git a/virt/kvm/arm/perf.c b/virt/kvm/arm/perf.c
deleted file mode 100644 (file)
index d45b8b9..0000000
+++ /dev/null
@@ -1,57 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Based on the x86 implementation.
- *
- * Copyright (C) 2012 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- */
-
-#include <linux/perf_event.h>
-#include <linux/kvm_host.h>
-
-#include <asm/kvm_emulate.h>
-
-static int kvm_is_in_guest(void)
-{
-       return kvm_get_running_vcpu() != NULL;
-}
-
-static int kvm_is_user_mode(void)
-{
-       struct kvm_vcpu *vcpu;
-
-       vcpu = kvm_get_running_vcpu();
-
-       if (vcpu)
-               return !vcpu_mode_priv(vcpu);
-
-       return 0;
-}
-
-static unsigned long kvm_get_guest_ip(void)
-{
-       struct kvm_vcpu *vcpu;
-
-       vcpu = kvm_get_running_vcpu();
-
-       if (vcpu)
-               return *vcpu_pc(vcpu);
-
-       return 0;
-}
-
-static struct perf_guest_info_callbacks kvm_guest_cbs = {
-       .is_in_guest    = kvm_is_in_guest,
-       .is_user_mode   = kvm_is_user_mode,
-       .get_guest_ip   = kvm_get_guest_ip,
-};
-
-int kvm_perf_init(void)
-{
-       return perf_register_guest_info_callbacks(&kvm_guest_cbs);
-}
-
-int kvm_perf_teardown(void)
-{
-       return perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
-}
diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
deleted file mode 100644 (file)
index f0d0312..0000000
+++ /dev/null
@@ -1,869 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2015 Linaro Ltd.
- * Author: Shannon Zhao <shannon.zhao@linaro.org>
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/perf_event.h>
-#include <linux/perf/arm_pmu.h>
-#include <linux/uaccess.h>
-#include <asm/kvm_emulate.h>
-#include <kvm/arm_pmu.h>
-#include <kvm/arm_vgic.h>
-
-static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx);
-static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx);
-static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc);
-
-#define PERF_ATTR_CFG1_KVM_PMU_CHAINED 0x1
-
-/**
- * kvm_pmu_idx_is_64bit - determine if select_idx is a 64bit counter
- * @vcpu: The vcpu pointer
- * @select_idx: The counter index
- */
-static bool kvm_pmu_idx_is_64bit(struct kvm_vcpu *vcpu, u64 select_idx)
-{
-       return (select_idx == ARMV8_PMU_CYCLE_IDX &&
-               __vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_LC);
-}
-
-static struct kvm_vcpu *kvm_pmc_to_vcpu(struct kvm_pmc *pmc)
-{
-       struct kvm_pmu *pmu;
-       struct kvm_vcpu_arch *vcpu_arch;
-
-       pmc -= pmc->idx;
-       pmu = container_of(pmc, struct kvm_pmu, pmc[0]);
-       vcpu_arch = container_of(pmu, struct kvm_vcpu_arch, pmu);
-       return container_of(vcpu_arch, struct kvm_vcpu, arch);
-}
-
-/**
- * kvm_pmu_pmc_is_chained - determine if the pmc is chained
- * @pmc: The PMU counter pointer
- */
-static bool kvm_pmu_pmc_is_chained(struct kvm_pmc *pmc)
-{
-       struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
-
-       return test_bit(pmc->idx >> 1, vcpu->arch.pmu.chained);
-}
-
-/**
- * kvm_pmu_idx_is_high_counter - determine if select_idx is the high half of a counter pair
- * @select_idx: The counter index
- */
-static bool kvm_pmu_idx_is_high_counter(u64 select_idx)
-{
-       return select_idx & 0x1;
-}
-
-/**
- * kvm_pmu_get_canonical_pmc - obtain the canonical pmc
- * @pmc: The PMU counter pointer
- *
- * When a pair of PMCs are chained together we use the low counter (canonical)
- * to hold the underlying perf event.
- */
-static struct kvm_pmc *kvm_pmu_get_canonical_pmc(struct kvm_pmc *pmc)
-{
-       if (kvm_pmu_pmc_is_chained(pmc) &&
-           kvm_pmu_idx_is_high_counter(pmc->idx))
-               return pmc - 1;
-
-       return pmc;
-}
-static struct kvm_pmc *kvm_pmu_get_alternate_pmc(struct kvm_pmc *pmc)
-{
-       if (kvm_pmu_idx_is_high_counter(pmc->idx))
-               return pmc - 1;
-       else
-               return pmc + 1;
-}
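
In index terms, the canonical/alternate lookups above are even/odd arithmetic on a counter pair. A sketch of that arithmetic (the chained-state check that guards the canonical lookup is left out for brevity):

#include <stdint.h>

/* Counters chain in {even, odd} pairs; the even (low) index is canonical. */
static uint64_t canonical_idx(uint64_t idx)
{
	return (idx & 0x1) ? idx - 1 : idx;
}

/* The other counter of the same pair. */
static uint64_t alternate_idx(uint64_t idx)
{
	return (idx & 0x1) ? idx - 1 : idx + 1;
}
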
-
-/**
- * kvm_pmu_idx_has_chain_evtype - determine if the event type is chain
- * @vcpu: The vcpu pointer
- * @select_idx: The counter index
- */
-static bool kvm_pmu_idx_has_chain_evtype(struct kvm_vcpu *vcpu, u64 select_idx)
-{
-       u64 eventsel, reg;
-
-       select_idx |= 0x1;
-
-       if (select_idx == ARMV8_PMU_CYCLE_IDX)
-               return false;
-
-       reg = PMEVTYPER0_EL0 + select_idx;
-       eventsel = __vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_EVENT;
-
-       return eventsel == ARMV8_PMUV3_PERFCTR_CHAIN;
-}
-
-/**
- * kvm_pmu_get_pair_counter_value - get PMU counter value
- * @vcpu: The vcpu pointer
- * @pmc: The PMU counter pointer
- */
-static u64 kvm_pmu_get_pair_counter_value(struct kvm_vcpu *vcpu,
-                                         struct kvm_pmc *pmc)
-{
-       u64 counter, counter_high, reg, enabled, running;
-
-       if (kvm_pmu_pmc_is_chained(pmc)) {
-               pmc = kvm_pmu_get_canonical_pmc(pmc);
-               reg = PMEVCNTR0_EL0 + pmc->idx;
-
-               counter = __vcpu_sys_reg(vcpu, reg);
-               counter_high = __vcpu_sys_reg(vcpu, reg + 1);
-
-               counter = lower_32_bits(counter) | (counter_high << 32);
-       } else {
-               reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX)
-                     ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + pmc->idx;
-               counter = __vcpu_sys_reg(vcpu, reg);
-       }
-
-       /*
-        * The real counter value is equal to the value of the counter register
-        * plus the value the perf event has counted.
-        */
-       if (pmc->perf_event)
-               counter += perf_event_read_value(pmc->perf_event, &enabled,
-                                                &running);
-
-       return counter;
-}
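
For a chained pair, the 64-bit logical value is reassembled from two 32-bit register halves (the even, canonical counter holds the low word and its odd partner the high word) before the live perf delta is added. A standalone sketch of that reconstruction:

#include <stdint.h>

static uint64_t chained_counter_value(uint32_t low_half, uint32_t high_half,
				      uint64_t perf_delta)
{
	uint64_t counter = (uint64_t)low_half | ((uint64_t)high_half << 32);

	return counter + perf_delta;	/* register value + live perf count */
}
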
-
-/**
- * kvm_pmu_get_counter_value - get PMU counter value
- * @vcpu: The vcpu pointer
- * @select_idx: The counter index
- */
-u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx)
-{
-       u64 counter;
-       struct kvm_pmu *pmu = &vcpu->arch.pmu;
-       struct kvm_pmc *pmc = &pmu->pmc[select_idx];
-
-       counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
-
-       if (kvm_pmu_pmc_is_chained(pmc) &&
-           kvm_pmu_idx_is_high_counter(select_idx))
-               counter = upper_32_bits(counter);
-       else if (select_idx != ARMV8_PMU_CYCLE_IDX)
-               counter = lower_32_bits(counter);
-
-       return counter;
-}
-
-/**
- * kvm_pmu_set_counter_value - set PMU counter value
- * @vcpu: The vcpu pointer
- * @select_idx: The counter index
- * @val: The counter value
- */
-void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val)
-{
-       u64 reg;
-
-       reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
-             ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx;
-       __vcpu_sys_reg(vcpu, reg) += (s64)val - kvm_pmu_get_counter_value(vcpu, select_idx);
-
-       /* Recreate the perf event to reflect the updated sample_period */
-       kvm_pmu_create_perf_event(vcpu, select_idx);
-}
-
-/**
- * kvm_pmu_release_perf_event - remove the perf event
- * @pmc: The PMU counter pointer
- */
-static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc)
-{
-       pmc = kvm_pmu_get_canonical_pmc(pmc);
-       if (pmc->perf_event) {
-               perf_event_disable(pmc->perf_event);
-               perf_event_release_kernel(pmc->perf_event);
-               pmc->perf_event = NULL;
-       }
-}
-
-/**
- * kvm_pmu_stop_counter - stop PMU counter
- * @pmc: The PMU counter pointer
- *
- * If this counter has been configured to monitor some event, release it here.
- */
-static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc)
-{
-       u64 counter, reg, val;
-
-       pmc = kvm_pmu_get_canonical_pmc(pmc);
-       if (!pmc->perf_event)
-               return;
-
-       counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
-
-       if (pmc->idx == ARMV8_PMU_CYCLE_IDX) {
-               reg = PMCCNTR_EL0;
-               val = counter;
-       } else {
-               reg = PMEVCNTR0_EL0 + pmc->idx;
-               val = lower_32_bits(counter);
-       }
-
-       __vcpu_sys_reg(vcpu, reg) = val;
-
-       if (kvm_pmu_pmc_is_chained(pmc))
-               __vcpu_sys_reg(vcpu, reg + 1) = upper_32_bits(counter);
-
-       kvm_pmu_release_perf_event(pmc);
-}
-
-/**
- * kvm_pmu_vcpu_init - assign pmu counter idx for cpu
- * @vcpu: The vcpu pointer
- *
- */
-void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu)
-{
-       int i;
-       struct kvm_pmu *pmu = &vcpu->arch.pmu;
-
-       for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++)
-               pmu->pmc[i].idx = i;
-}
-
-/**
- * kvm_pmu_vcpu_reset - reset pmu state for cpu
- * @vcpu: The vcpu pointer
- *
- */
-void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu)
-{
-       unsigned long mask = kvm_pmu_valid_counter_mask(vcpu);
-       struct kvm_pmu *pmu = &vcpu->arch.pmu;
-       int i;
-
-       for_each_set_bit(i, &mask, 32)
-               kvm_pmu_stop_counter(vcpu, &pmu->pmc[i]);
-
-       bitmap_zero(vcpu->arch.pmu.chained, ARMV8_PMU_MAX_COUNTER_PAIRS);
-}
-
-/**
- * kvm_pmu_vcpu_destroy - free perf event of PMU for cpu
- * @vcpu: The vcpu pointer
- *
- */
-void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu)
-{
-       int i;
-       struct kvm_pmu *pmu = &vcpu->arch.pmu;
-
-       for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++)
-               kvm_pmu_release_perf_event(&pmu->pmc[i]);
-}
-
-u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu)
-{
-       u64 val = __vcpu_sys_reg(vcpu, PMCR_EL0) >> ARMV8_PMU_PMCR_N_SHIFT;
-
-       val &= ARMV8_PMU_PMCR_N_MASK;
-       if (val == 0)
-               return BIT(ARMV8_PMU_CYCLE_IDX);
-       else
-               return GENMASK(val - 1, 0) | BIT(ARMV8_PMU_CYCLE_IDX);
-}
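
The mask above comes straight from PMCR_EL0.N plus the fixed cycle counter bit. A sketch of the same computation, assuming the usual ARMv8 PMU layout (N at bits [15:11], cycle counter at index 31):

#include <stdint.h>

#define SK_PMU_CYCLE_IDX	31
#define SK_PMCR_N_SHIFT		11
#define SK_PMCR_N_MASK		0x1f

static uint64_t valid_counter_mask(uint64_t pmcr)
{
	uint64_t n = (pmcr >> SK_PMCR_N_SHIFT) & SK_PMCR_N_MASK;
	uint64_t mask = 1ULL << SK_PMU_CYCLE_IDX;	/* cycle counter always valid */

	if (n)
		mask |= (1ULL << n) - 1;	/* GENMASK(n - 1, 0) */
	return mask;
}
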
-
-/**
- * kvm_pmu_enable_counter_mask - enable selected PMU counters
- * @vcpu: The vcpu pointer
- * @val: the value the guest writes to the PMCNTENSET register
- *
- * Call perf_event_enable to start counting the perf event
- */
-void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val)
-{
-       int i;
-       struct kvm_pmu *pmu = &vcpu->arch.pmu;
-       struct kvm_pmc *pmc;
-
-       if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) || !val)
-               return;
-
-       for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) {
-               if (!(val & BIT(i)))
-                       continue;
-
-               pmc = &pmu->pmc[i];
-
-               /* A change in the enable state may affect the chain state */
-               kvm_pmu_update_pmc_chained(vcpu, i);
-               kvm_pmu_create_perf_event(vcpu, i);
-
-               /* At this point, pmc must be the canonical */
-               if (pmc->perf_event) {
-                       perf_event_enable(pmc->perf_event);
-                       if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE)
-                               kvm_debug("fail to enable perf event\n");
-               }
-       }
-}
-
-/**
- * kvm_pmu_disable_counter_mask - disable selected PMU counters
- * @vcpu: The vcpu pointer
- * @val: the value the guest writes to the PMCNTENCLR register
- *
- * Call perf_event_disable to stop counting the perf event
- */
-void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val)
-{
-       int i;
-       struct kvm_pmu *pmu = &vcpu->arch.pmu;
-       struct kvm_pmc *pmc;
-
-       if (!val)
-               return;
-
-       for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) {
-               if (!(val & BIT(i)))
-                       continue;
-
-               pmc = &pmu->pmc[i];
-
-               /* A change in the enable state may affect the chain state */
-               kvm_pmu_update_pmc_chained(vcpu, i);
-               kvm_pmu_create_perf_event(vcpu, i);
-
-               /* At this point, pmc must be the canonical counter */
-               if (pmc->perf_event)
-                       perf_event_disable(pmc->perf_event);
-       }
-}
-
-static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu)
-{
-       u64 reg = 0;
-
-       if ((__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) {
-               reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
-               reg &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
-               reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1);
-               reg &= kvm_pmu_valid_counter_mask(vcpu);
-       }
-
-       return reg;
-}
-
-static void kvm_pmu_update_state(struct kvm_vcpu *vcpu)
-{
-       struct kvm_pmu *pmu = &vcpu->arch.pmu;
-       bool overflow;
-
-       if (!kvm_arm_pmu_v3_ready(vcpu))
-               return;
-
-       overflow = !!kvm_pmu_overflow_status(vcpu);
-       if (pmu->irq_level == overflow)
-               return;
-
-       pmu->irq_level = overflow;
-
-       if (likely(irqchip_in_kernel(vcpu->kvm))) {
-               int ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
-                                             pmu->irq_num, overflow, pmu);
-               WARN_ON(ret);
-       }
-}
-
-bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu)
-{
-       struct kvm_pmu *pmu = &vcpu->arch.pmu;
-       struct kvm_sync_regs *sregs = &vcpu->run->s.regs;
-       bool run_level = sregs->device_irq_level & KVM_ARM_DEV_PMU;
-
-       if (likely(irqchip_in_kernel(vcpu->kvm)))
-               return false;
-
-       return pmu->irq_level != run_level;
-}
-
-/*
- * Reflect the PMU overflow interrupt output level into the kvm_run structure
- */
-void kvm_pmu_update_run(struct kvm_vcpu *vcpu)
-{
-       struct kvm_sync_regs *regs = &vcpu->run->s.regs;
-
-       /* Populate the PMU overflow bit for user space */
-       regs->device_irq_level &= ~KVM_ARM_DEV_PMU;
-       if (vcpu->arch.pmu.irq_level)
-               regs->device_irq_level |= KVM_ARM_DEV_PMU;
-}
-
-/**
- * kvm_pmu_flush_hwstate - flush pmu state to cpu
- * @vcpu: The vcpu pointer
- *
- * Check if the PMU has overflowed while we were running in the host, and inject
- * an interrupt if that was the case.
- */
-void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu)
-{
-       kvm_pmu_update_state(vcpu);
-}
-
-/**
- * kvm_pmu_sync_hwstate - sync pmu state from cpu
- * @vcpu: The vcpu pointer
- *
- * Check if the PMU has overflowed while we were running in the guest, and
- * inject an interrupt if that was the case.
- */
-void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu)
-{
-       kvm_pmu_update_state(vcpu);
-}
-
-/*
- * When the perf event overflows, set the overflow status and inform the vcpu.
- */
-static void kvm_pmu_perf_overflow(struct perf_event *perf_event,
-                                 struct perf_sample_data *data,
-                                 struct pt_regs *regs)
-{
-       struct kvm_pmc *pmc = perf_event->overflow_handler_context;
-       struct arm_pmu *cpu_pmu = to_arm_pmu(perf_event->pmu);
-       struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
-       int idx = pmc->idx;
-       u64 period;
-
-       cpu_pmu->pmu.stop(perf_event, PERF_EF_UPDATE);
-
-       /*
-        * Reset the sample period to the architectural limit,
-        * i.e. the point where the counter overflows.
-        */
-       period = -(local64_read(&perf_event->count));
-
-       if (!kvm_pmu_idx_is_64bit(vcpu, pmc->idx))
-               period &= GENMASK(31, 0);
-
-       local64_set(&perf_event->hw.period_left, 0);
-       perf_event->attr.sample_period = period;
-       perf_event->hw.sample_period = period;
-
-       __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx);
-
-       if (kvm_pmu_overflow_status(vcpu)) {
-               kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
-               kvm_vcpu_kick(vcpu);
-       }
-
-       cpu_pmu->pmu.start(perf_event, PERF_EF_RELOAD);
-}
-
-/**
- * kvm_pmu_software_increment - do software increment
- * @vcpu: The vcpu pointer
- * @val: the value the guest writes to the PMSWINC register
- */
-void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val)
-{
-       struct kvm_pmu *pmu = &vcpu->arch.pmu;
-       int i;
-
-       if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E))
-               return;
-
-       /* Weed out disabled counters */
-       val &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
-
-       for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++) {
-               u64 type, reg;
-
-               if (!(val & BIT(i)))
-                       continue;
-
-               /* PMSWINC only applies to ... SW_INC! */
-               type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i);
-               type &= ARMV8_PMU_EVTYPE_EVENT;
-               if (type != ARMV8_PMUV3_PERFCTR_SW_INCR)
-                       continue;
-
-               /* increment the lower 32 bits of this SW_INC counter */
-               reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1;
-               reg = lower_32_bits(reg);
-               __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg;
-
-               if (reg) /* no overflow on the low part */
-                       continue;
-
-               if (kvm_pmu_pmc_is_chained(&pmu->pmc[i])) {
-                       /* increment the high counter */
-                       reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) + 1;
-                       reg = lower_32_bits(reg);
-                       __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) = reg;
-                       if (!reg) /* mark overflow on the high counter */
-                               __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i + 1);
-               } else {
-                       /* mark overflow on low counter */
-                       __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i);
-               }
-       }
-}
-
-/**
- * kvm_pmu_handle_pmcr - handle PMCR register
- * @vcpu: The vcpu pointer
- * @val: the value the guest writes to the PMCR register
- */
-void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
-{
-       unsigned long mask = kvm_pmu_valid_counter_mask(vcpu);
-       int i;
-
-       if (val & ARMV8_PMU_PMCR_E) {
-               kvm_pmu_enable_counter_mask(vcpu,
-                      __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask);
-       } else {
-               kvm_pmu_disable_counter_mask(vcpu, mask);
-       }
-
-       if (val & ARMV8_PMU_PMCR_C)
-               kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0);
-
-       if (val & ARMV8_PMU_PMCR_P) {
-               for_each_set_bit(i, &mask, 32)
-                       kvm_pmu_set_counter_value(vcpu, i, 0);
-       }
-}
-
-static bool kvm_pmu_counter_is_enabled(struct kvm_vcpu *vcpu, u64 select_idx)
-{
-       return (__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) &&
-              (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(select_idx));
-}
-
-/**
- * kvm_pmu_create_perf_event - create a perf event for a counter
- * @vcpu: The vcpu pointer
- * @select_idx: The index of the selected counter
- */
-static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
-{
-       struct kvm_pmu *pmu = &vcpu->arch.pmu;
-       struct kvm_pmc *pmc;
-       struct perf_event *event;
-       struct perf_event_attr attr;
-       u64 eventsel, counter, reg, data;
-
-       /*
-        * For chained counters the event type and filtering attributes are
-        * obtained from the low/even counter. We also use this counter to
-        * determine if the event is enabled/disabled.
-        */
-       pmc = kvm_pmu_get_canonical_pmc(&pmu->pmc[select_idx]);
-
-       reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX)
-             ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + pmc->idx;
-       data = __vcpu_sys_reg(vcpu, reg);
-
-       kvm_pmu_stop_counter(vcpu, pmc);
-       eventsel = data & ARMV8_PMU_EVTYPE_EVENT;
-
-       /* Software increment event doesn't need to be backed by a perf event */
-       if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR &&
-           pmc->idx != ARMV8_PMU_CYCLE_IDX)
-               return;
-
-       memset(&attr, 0, sizeof(struct perf_event_attr));
-       attr.type = PERF_TYPE_RAW;
-       attr.size = sizeof(attr);
-       attr.pinned = 1;
-       attr.disabled = !kvm_pmu_counter_is_enabled(vcpu, pmc->idx);
-       attr.exclude_user = data & ARMV8_PMU_EXCLUDE_EL0 ? 1 : 0;
-       attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0;
-       attr.exclude_hv = 1; /* Don't count EL2 events */
-       attr.exclude_host = 1; /* Don't count host events */
-       attr.config = (pmc->idx == ARMV8_PMU_CYCLE_IDX) ?
-               ARMV8_PMUV3_PERFCTR_CPU_CYCLES : eventsel;
-
-       counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
-
-       if (kvm_pmu_pmc_is_chained(pmc)) {
-               /*
-                * The initial sample period (overflow count) of an event. For
-                * chained counters we only support overflow interrupts on the
-                * high counter.
-                */
-               attr.sample_period = (-counter) & GENMASK(63, 0);
-               attr.config1 |= PERF_ATTR_CFG1_KVM_PMU_CHAINED;
-
-               event = perf_event_create_kernel_counter(&attr, -1, current,
-                                                        kvm_pmu_perf_overflow,
-                                                        pmc + 1);
-       } else {
-               /* The initial sample period (overflow count) of an event. */
-               if (kvm_pmu_idx_is_64bit(vcpu, pmc->idx))
-                       attr.sample_period = (-counter) & GENMASK(63, 0);
-               else
-                       attr.sample_period = (-counter) & GENMASK(31, 0);
-
-               event = perf_event_create_kernel_counter(&attr, -1, current,
-                                                kvm_pmu_perf_overflow, pmc);
-       }
-
-       if (IS_ERR(event)) {
-               pr_err_once("kvm: pmu event creation failed %ld\n",
-                           PTR_ERR(event));
-               return;
-       }
-
-       pmc->perf_event = event;
-}
-
-/**
- * kvm_pmu_update_pmc_chained - update chained bitmap
- * @vcpu: The vcpu pointer
- * @select_idx: The index of the selected counter
- *
- * Update the chained bitmap based on the event type written in the
- * typer register and the enable state of the odd register.
- */
-static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx)
-{
-       struct kvm_pmu *pmu = &vcpu->arch.pmu;
-       struct kvm_pmc *pmc = &pmu->pmc[select_idx], *canonical_pmc;
-       bool new_state, old_state;
-
-       old_state = kvm_pmu_pmc_is_chained(pmc);
-       new_state = kvm_pmu_idx_has_chain_evtype(vcpu, pmc->idx) &&
-                   kvm_pmu_counter_is_enabled(vcpu, pmc->idx | 0x1);
-
-       if (old_state == new_state)
-               return;
-
-       canonical_pmc = kvm_pmu_get_canonical_pmc(pmc);
-       kvm_pmu_stop_counter(vcpu, canonical_pmc);
-       if (new_state) {
-               /*
-                * During promotion from !chained to chained we must ensure
-                * the adjacent counter is stopped and its event destroyed
-                */
-               kvm_pmu_stop_counter(vcpu, kvm_pmu_get_alternate_pmc(pmc));
-               set_bit(pmc->idx >> 1, vcpu->arch.pmu.chained);
-               return;
-       }
-       clear_bit(pmc->idx >> 1, vcpu->arch.pmu.chained);
-}
-
-/**
- * kvm_pmu_set_counter_event_type - set selected counter to monitor some event
- * @vcpu: The vcpu pointer
- * @data: The data the guest writes to PMXEVTYPER_EL0
- * @select_idx: The index of the selected counter
- *
- * When the guest OS accesses PMXEVTYPER_EL0, it wants to set a PMC to count an
- * event with the given hardware event number. Here we call the perf_event API
- * to emulate this action and create a kernel perf event for it.
- */
-void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
-                                   u64 select_idx)
-{
-       u64 reg, event_type = data & ARMV8_PMU_EVTYPE_MASK;
-
-       reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
-             ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + select_idx;
-
-       __vcpu_sys_reg(vcpu, reg) = event_type;
-
-       kvm_pmu_update_pmc_chained(vcpu, select_idx);
-       kvm_pmu_create_perf_event(vcpu, select_idx);
-}
-
-bool kvm_arm_support_pmu_v3(void)
-{
-       /*
-        * Check if HW_PERF_EVENTS is supported by checking the number of
-        * hardware performance counters. This ensures the presence of a
-        * physical PMU and that CONFIG_PERF_EVENTS is selected.
-        */
-       return (perf_num_counters() > 0);
-}
-
-int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
-{
-       if (!vcpu->arch.pmu.created)
-               return 0;
-
-       /*
-        * A valid interrupt configuration for the PMU is either to have a
-        * properly configured interrupt number together with an in-kernel
-        * irqchip, or to have neither an in-kernel GIC nor an IRQ set.
-        */
-       if (irqchip_in_kernel(vcpu->kvm)) {
-               int irq = vcpu->arch.pmu.irq_num;
-               if (!kvm_arm_pmu_irq_initialized(vcpu))
-                       return -EINVAL;
-
-               /*
-                * If we are using an in-kernel vgic, at this point we know
-                * the vgic will be initialized, so we can check the PMU irq
-                * number against the dimensions of the vgic and make sure
-                * it's valid.
-                */
-               if (!irq_is_ppi(irq) && !vgic_valid_spi(vcpu->kvm, irq))
-                       return -EINVAL;
-       } else if (kvm_arm_pmu_irq_initialized(vcpu)) {
-                  return -EINVAL;
-       }
-
-       kvm_pmu_vcpu_reset(vcpu);
-       vcpu->arch.pmu.ready = true;
-
-       return 0;
-}
-
-static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
-{
-       if (!kvm_arm_support_pmu_v3())
-               return -ENODEV;
-
-       if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
-               return -ENXIO;
-
-       if (vcpu->arch.pmu.created)
-               return -EBUSY;
-
-       if (irqchip_in_kernel(vcpu->kvm)) {
-               int ret;
-
-               /*
-                * If using the PMU with an in-kernel virtual GIC
-                * implementation, we require the GIC to be already
-                * initialized when initializing the PMU.
-                */
-               if (!vgic_initialized(vcpu->kvm))
-                       return -ENODEV;
-
-               if (!kvm_arm_pmu_irq_initialized(vcpu))
-                       return -ENXIO;
-
-               ret = kvm_vgic_set_owner(vcpu, vcpu->arch.pmu.irq_num,
-                                        &vcpu->arch.pmu);
-               if (ret)
-                       return ret;
-       }
-
-       vcpu->arch.pmu.created = true;
-       return 0;
-}
-
-/*
- * For one VM the interrupt type must be the same for each vcpu.
- * As a PPI, the interrupt number is the same for all vcpus,
- * while as an SPI it must be a separate number per vcpu.
- */
-static bool pmu_irq_is_valid(struct kvm *kvm, int irq)
-{
-       int i;
-       struct kvm_vcpu *vcpu;
-
-       kvm_for_each_vcpu(i, vcpu, kvm) {
-               if (!kvm_arm_pmu_irq_initialized(vcpu))
-                       continue;
-
-               if (irq_is_ppi(irq)) {
-                       if (vcpu->arch.pmu.irq_num != irq)
-                               return false;
-               } else {
-                       if (vcpu->arch.pmu.irq_num == irq)
-                               return false;
-               }
-       }
-
-       return true;
-}
-
-int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
-{
-       switch (attr->attr) {
-       case KVM_ARM_VCPU_PMU_V3_IRQ: {
-               int __user *uaddr = (int __user *)(long)attr->addr;
-               int irq;
-
-               if (!irqchip_in_kernel(vcpu->kvm))
-                       return -EINVAL;
-
-               if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
-                       return -ENODEV;
-
-               if (get_user(irq, uaddr))
-                       return -EFAULT;
-
-               /* The PMU overflow interrupt can be a PPI or a valid SPI. */
-               if (!(irq_is_ppi(irq) || irq_is_spi(irq)))
-                       return -EINVAL;
-
-               if (!pmu_irq_is_valid(vcpu->kvm, irq))
-                       return -EINVAL;
-
-               if (kvm_arm_pmu_irq_initialized(vcpu))
-                       return -EBUSY;
-
-               kvm_debug("Set kvm ARM PMU irq: %d\n", irq);
-               vcpu->arch.pmu.irq_num = irq;
-               return 0;
-       }
-       case KVM_ARM_VCPU_PMU_V3_INIT:
-               return kvm_arm_pmu_v3_init(vcpu);
-       }
-
-       return -ENXIO;
-}
-
-int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
-{
-       switch (attr->attr) {
-       case KVM_ARM_VCPU_PMU_V3_IRQ: {
-               int __user *uaddr = (int __user *)(long)attr->addr;
-               int irq;
-
-               if (!irqchip_in_kernel(vcpu->kvm))
-                       return -EINVAL;
-
-               if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
-                       return -ENODEV;
-
-               if (!kvm_arm_pmu_irq_initialized(vcpu))
-                       return -ENXIO;
-
-               irq = vcpu->arch.pmu.irq_num;
-               return put_user(irq, uaddr);
-       }
-       }
-
-       return -ENXIO;
-}
-
-int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
-{
-       switch (attr->attr) {
-       case KVM_ARM_VCPU_PMU_V3_IRQ:
-       case KVM_ARM_VCPU_PMU_V3_INIT:
-               if (kvm_arm_support_pmu_v3() &&
-                   test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
-                       return 0;
-       }
-
-       return -ENXIO;
-}
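
Two pieces of arithmetic in the pmu-emul code above are easy to lose in a diff this size: kvm_pmu_valid_counter_mask() derives the usable counter mask from the PMCR_EL0.N field, and the perf sample period is programmed as the two's complement of the current counter value so that the perf event fires exactly when the architectural counter would wrap. A minimal user-space sketch of both computations follows; the constants are mirrored from the arm64 PMU definitions and should be treated as assumptions of this note rather than part of the patch.

#include <stdint.h>
#include <stdio.h>

#define ARMV8_PMU_PMCR_N_SHIFT  11      /* PMCR_EL0.N: number of event counters */
#define ARMV8_PMU_PMCR_N_MASK   0x1f
#define ARMV8_PMU_CYCLE_IDX     31      /* the cycle counter sits at index 31 */

/* Same shape as kvm_pmu_valid_counter_mask(): N event counters + cycle counter. */
static uint64_t valid_counter_mask(uint64_t pmcr)
{
        uint64_t n = (pmcr >> ARMV8_PMU_PMCR_N_SHIFT) & ARMV8_PMU_PMCR_N_MASK;

        if (!n)
                return 1ULL << ARMV8_PMU_CYCLE_IDX;
        return ((1ULL << n) - 1) | (1ULL << ARMV8_PMU_CYCLE_IDX);
}

/* Sample period chosen so a perf overflow coincides with a 32-bit counter wrap. */
static uint64_t sample_period_32(uint64_t counter)
{
        return (0ULL - counter) & 0xffffffffULL;
}

int main(void)
{
        uint64_t pmcr = 6ULL << ARMV8_PMU_PMCR_N_SHIFT; /* pretend N == 6 */

        printf("counter mask:  %#llx\n", (unsigned long long)valid_counter_mask(pmcr));
        printf("sample period: %#llx\n", (unsigned long long)sample_period_32(0xfffffff0ULL));
        return 0;
}
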
diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c
deleted file mode 100644 (file)
index ae36471..0000000
+++ /dev/null
@@ -1,564 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2012 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- */
-
-#include <linux/arm-smccc.h>
-#include <linux/preempt.h>
-#include <linux/kvm_host.h>
-#include <linux/uaccess.h>
-#include <linux/wait.h>
-
-#include <asm/cputype.h>
-#include <asm/kvm_emulate.h>
-
-#include <kvm/arm_psci.h>
-#include <kvm/arm_hypercalls.h>
-
-/*
- * This is an implementation of the Power State Coordination Interface
- * as described in ARM document number ARM DEN 0022A.
- */
-
-#define AFFINITY_MASK(level)   ~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1)
-
-static unsigned long psci_affinity_mask(unsigned long affinity_level)
-{
-       if (affinity_level <= 3)
-               return MPIDR_HWID_BITMASK & AFFINITY_MASK(affinity_level);
-
-       return 0;
-}
-
-static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu)
-{
-       /*
-        * NOTE: For simplicity, we treat VCPU suspend emulation the same
-        * as WFI (Wait For Interrupt) emulation.
-        *
-        * This means that for KVM the wakeup events are interrupts, which
-        * is consistent with the intended use of StateID as described in
-        * section 5.4.1 of the PSCI v0.2 specification (ARM DEN 0022A).
-        *
-        * Further, we also treat a power-down request the same as a
-        * stand-by request, as per section 5.4.2 clause 3 of the PSCI v0.2
-        * specification (ARM DEN 0022A). This means all suspend states
-        * for KVM will preserve the register state.
-        */
-       kvm_vcpu_block(vcpu);
-       kvm_clear_request(KVM_REQ_UNHALT, vcpu);
-
-       return PSCI_RET_SUCCESS;
-}
-
-static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
-{
-       vcpu->arch.power_off = true;
-       kvm_make_request(KVM_REQ_SLEEP, vcpu);
-       kvm_vcpu_kick(vcpu);
-}
-
-static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
-{
-       struct vcpu_reset_state *reset_state;
-       struct kvm *kvm = source_vcpu->kvm;
-       struct kvm_vcpu *vcpu = NULL;
-       unsigned long cpu_id;
-
-       cpu_id = smccc_get_arg1(source_vcpu) & MPIDR_HWID_BITMASK;
-       if (vcpu_mode_is_32bit(source_vcpu))
-               cpu_id &= ~((u32) 0);
-
-       vcpu = kvm_mpidr_to_vcpu(kvm, cpu_id);
-
-       /*
-        * Make sure the caller requested a valid CPU and that the CPU is
-        * turned off.
-        */
-       if (!vcpu)
-               return PSCI_RET_INVALID_PARAMS;
-       if (!vcpu->arch.power_off) {
-               if (kvm_psci_version(source_vcpu, kvm) != KVM_ARM_PSCI_0_1)
-                       return PSCI_RET_ALREADY_ON;
-               else
-                       return PSCI_RET_INVALID_PARAMS;
-       }
-
-       reset_state = &vcpu->arch.reset_state;
-
-       reset_state->pc = smccc_get_arg2(source_vcpu);
-
-       /* Propagate caller endianness */
-       reset_state->be = kvm_vcpu_is_be(source_vcpu);
-
-       /*
-        * NOTE: We always update r0 (or x0) because for PSCI v0.1
-        * the general purpose registers are undefined upon CPU_ON.
-        */
-       reset_state->r0 = smccc_get_arg3(source_vcpu);
-
-       WRITE_ONCE(reset_state->reset, true);
-       kvm_make_request(KVM_REQ_VCPU_RESET, vcpu);
-
-       /*
-        * Make sure the reset request is observed if the change to
-        * power_state is observed.
-        */
-       smp_wmb();
-
-       vcpu->arch.power_off = false;
-       kvm_vcpu_wake_up(vcpu);
-
-       return PSCI_RET_SUCCESS;
-}
-
-static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
-{
-       int i, matching_cpus = 0;
-       unsigned long mpidr;
-       unsigned long target_affinity;
-       unsigned long target_affinity_mask;
-       unsigned long lowest_affinity_level;
-       struct kvm *kvm = vcpu->kvm;
-       struct kvm_vcpu *tmp;
-
-       target_affinity = smccc_get_arg1(vcpu);
-       lowest_affinity_level = smccc_get_arg2(vcpu);
-
-       /* Determine target affinity mask */
-       target_affinity_mask = psci_affinity_mask(lowest_affinity_level);
-       if (!target_affinity_mask)
-               return PSCI_RET_INVALID_PARAMS;
-
-       /* Ignore other bits of target affinity */
-       target_affinity &= target_affinity_mask;
-
-       /*
-        * If one or more VCPUs matching the target affinity are
-        * running, report ON; otherwise report OFF.
-        */
-       kvm_for_each_vcpu(i, tmp, kvm) {
-               mpidr = kvm_vcpu_get_mpidr_aff(tmp);
-               if ((mpidr & target_affinity_mask) == target_affinity) {
-                       matching_cpus++;
-                       if (!tmp->arch.power_off)
-                               return PSCI_0_2_AFFINITY_LEVEL_ON;
-               }
-       }
-
-       if (!matching_cpus)
-               return PSCI_RET_INVALID_PARAMS;
-
-       return PSCI_0_2_AFFINITY_LEVEL_OFF;
-}
-
-static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type)
-{
-       int i;
-       struct kvm_vcpu *tmp;
-
-       /*
-        * The KVM ABI specifies that a system event exit may call KVM_RUN
-        * again and may perform shutdown/reboot at a later time than when the
-        * actual request is made.  Since we are implementing PSCI and a
-        * caller of PSCI reboot and shutdown expects that the system shuts
-        * down or reboots immediately, let's make sure that VCPUs are not run
-        * after this call is handled and before the VCPUs have been
-        * re-initialized.
-        */
-       kvm_for_each_vcpu(i, tmp, vcpu->kvm)
-               tmp->arch.power_off = true;
-       kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP);
-
-       memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
-       vcpu->run->system_event.type = type;
-       vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
-}
-
-static void kvm_psci_system_off(struct kvm_vcpu *vcpu)
-{
-       kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN);
-}
-
-static void kvm_psci_system_reset(struct kvm_vcpu *vcpu)
-{
-       kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET);
-}
-
-static void kvm_psci_narrow_to_32bit(struct kvm_vcpu *vcpu)
-{
-       int i;
-
-       /*
-        * Zero the input registers' upper 32 bits. They will be fully
-        * zeroed on exit, so we're fine changing them in place.
-        */
-       for (i = 1; i < 4; i++)
-               vcpu_set_reg(vcpu, i, lower_32_bits(vcpu_get_reg(vcpu, i)));
-}
-
-static unsigned long kvm_psci_check_allowed_function(struct kvm_vcpu *vcpu, u32 fn)
-{
-       switch(fn) {
-       case PSCI_0_2_FN64_CPU_SUSPEND:
-       case PSCI_0_2_FN64_CPU_ON:
-       case PSCI_0_2_FN64_AFFINITY_INFO:
-               /* Disallow these functions for 32bit guests */
-               if (vcpu_mode_is_32bit(vcpu))
-                       return PSCI_RET_NOT_SUPPORTED;
-               break;
-       }
-
-       return 0;
-}
-
-static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
-{
-       struct kvm *kvm = vcpu->kvm;
-       u32 psci_fn = smccc_get_function(vcpu);
-       unsigned long val;
-       int ret = 1;
-
-       val = kvm_psci_check_allowed_function(vcpu, psci_fn);
-       if (val)
-               goto out;
-
-       switch (psci_fn) {
-       case PSCI_0_2_FN_PSCI_VERSION:
-               /*
-                * Bits[31:16] = Major Version = 0
-                * Bits[15:0] = Minor Version = 2
-                */
-               val = KVM_ARM_PSCI_0_2;
-               break;
-       case PSCI_0_2_FN_CPU_SUSPEND:
-       case PSCI_0_2_FN64_CPU_SUSPEND:
-               val = kvm_psci_vcpu_suspend(vcpu);
-               break;
-       case PSCI_0_2_FN_CPU_OFF:
-               kvm_psci_vcpu_off(vcpu);
-               val = PSCI_RET_SUCCESS;
-               break;
-       case PSCI_0_2_FN_CPU_ON:
-               kvm_psci_narrow_to_32bit(vcpu);
-               fallthrough;
-       case PSCI_0_2_FN64_CPU_ON:
-               mutex_lock(&kvm->lock);
-               val = kvm_psci_vcpu_on(vcpu);
-               mutex_unlock(&kvm->lock);
-               break;
-       case PSCI_0_2_FN_AFFINITY_INFO:
-               kvm_psci_narrow_to_32bit(vcpu);
-               fallthrough;
-       case PSCI_0_2_FN64_AFFINITY_INFO:
-               val = kvm_psci_vcpu_affinity_info(vcpu);
-               break;
-       case PSCI_0_2_FN_MIGRATE_INFO_TYPE:
-               /*
-                * The Trusted OS is either MP (hence does not require
-                * migration) or not present.
-                */
-               val = PSCI_0_2_TOS_MP;
-               break;
-       case PSCI_0_2_FN_SYSTEM_OFF:
-               kvm_psci_system_off(vcpu);
-               /*
-                * We shouldn't be going back to the guest VCPU after
-                * receiving a SYSTEM_OFF request.
-                *
-                * If user space accidentally or deliberately resumes
-                * the guest VCPU after a SYSTEM_OFF request, the guest
-                * VCPU should see an internal failure from the PSCI
-                * return value. To achieve this, we preload r0 (or x0)
-                * with the PSCI return value INTERNAL_FAILURE.
-                */
-               val = PSCI_RET_INTERNAL_FAILURE;
-               ret = 0;
-               break;
-       case PSCI_0_2_FN_SYSTEM_RESET:
-               kvm_psci_system_reset(vcpu);
-               /*
-                * Same reason as SYSTEM_OFF for preloading r0 (or x0)
-                * with PSCI return value INTERNAL_FAILURE.
-                */
-               val = PSCI_RET_INTERNAL_FAILURE;
-               ret = 0;
-               break;
-       default:
-               val = PSCI_RET_NOT_SUPPORTED;
-               break;
-       }
-
-out:
-       smccc_set_retval(vcpu, val, 0, 0, 0);
-       return ret;
-}
-
-static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu)
-{
-       u32 psci_fn = smccc_get_function(vcpu);
-       u32 feature;
-       unsigned long val;
-       int ret = 1;
-
-       switch(psci_fn) {
-       case PSCI_0_2_FN_PSCI_VERSION:
-               val = KVM_ARM_PSCI_1_0;
-               break;
-       case PSCI_1_0_FN_PSCI_FEATURES:
-               feature = smccc_get_arg1(vcpu);
-               val = kvm_psci_check_allowed_function(vcpu, feature);
-               if (val)
-                       break;
-
-               switch(feature) {
-               case PSCI_0_2_FN_PSCI_VERSION:
-               case PSCI_0_2_FN_CPU_SUSPEND:
-               case PSCI_0_2_FN64_CPU_SUSPEND:
-               case PSCI_0_2_FN_CPU_OFF:
-               case PSCI_0_2_FN_CPU_ON:
-               case PSCI_0_2_FN64_CPU_ON:
-               case PSCI_0_2_FN_AFFINITY_INFO:
-               case PSCI_0_2_FN64_AFFINITY_INFO:
-               case PSCI_0_2_FN_MIGRATE_INFO_TYPE:
-               case PSCI_0_2_FN_SYSTEM_OFF:
-               case PSCI_0_2_FN_SYSTEM_RESET:
-               case PSCI_1_0_FN_PSCI_FEATURES:
-               case ARM_SMCCC_VERSION_FUNC_ID:
-                       val = 0;
-                       break;
-               default:
-                       val = PSCI_RET_NOT_SUPPORTED;
-                       break;
-               }
-               break;
-       default:
-               return kvm_psci_0_2_call(vcpu);
-       }
-
-       smccc_set_retval(vcpu, val, 0, 0, 0);
-       return ret;
-}
-
-static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
-{
-       struct kvm *kvm = vcpu->kvm;
-       u32 psci_fn = smccc_get_function(vcpu);
-       unsigned long val;
-
-       switch (psci_fn) {
-       case KVM_PSCI_FN_CPU_OFF:
-               kvm_psci_vcpu_off(vcpu);
-               val = PSCI_RET_SUCCESS;
-               break;
-       case KVM_PSCI_FN_CPU_ON:
-               mutex_lock(&kvm->lock);
-               val = kvm_psci_vcpu_on(vcpu);
-               mutex_unlock(&kvm->lock);
-               break;
-       default:
-               val = PSCI_RET_NOT_SUPPORTED;
-               break;
-       }
-
-       smccc_set_retval(vcpu, val, 0, 0, 0);
-       return 1;
-}
-
-/**
- * kvm_psci_call - handle PSCI call if r0 value is in range
- * @vcpu: Pointer to the VCPU struct
- *
- * Handle PSCI calls from guests through traps from HVC instructions.
- * The calling convention is similar to SMC calls to the secure world
- * where the function number is placed in r0.
- *
- * This function returns: > 0 (success), 0 (success but exit to user
- * space), and < 0 (errors)
- *
- * Errors:
- * -EINVAL: Unrecognized PSCI function
- */
-int kvm_psci_call(struct kvm_vcpu *vcpu)
-{
-       switch (kvm_psci_version(vcpu, vcpu->kvm)) {
-       case KVM_ARM_PSCI_1_0:
-               return kvm_psci_1_0_call(vcpu);
-       case KVM_ARM_PSCI_0_2:
-               return kvm_psci_0_2_call(vcpu);
-       case KVM_ARM_PSCI_0_1:
-               return kvm_psci_0_1_call(vcpu);
-       default:
-               return -EINVAL;
-       };
-}
-
-int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu)
-{
-       return 3;               /* PSCI version and two workaround registers */
-}
-
-int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
-{
-       if (put_user(KVM_REG_ARM_PSCI_VERSION, uindices++))
-               return -EFAULT;
-
-       if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1, uindices++))
-               return -EFAULT;
-
-       if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2, uindices++))
-               return -EFAULT;
-
-       return 0;
-}
-
-#define KVM_REG_FEATURE_LEVEL_WIDTH    4
-#define KVM_REG_FEATURE_LEVEL_MASK     (BIT(KVM_REG_FEATURE_LEVEL_WIDTH) - 1)
-
-/*
- * Convert the workaround level into an easy-to-compare number, where higher
- * values mean better protection.
- */
-static int get_kernel_wa_level(u64 regid)
-{
-       switch (regid) {
-       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
-               switch (kvm_arm_harden_branch_predictor()) {
-               case KVM_BP_HARDEN_UNKNOWN:
-                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL;
-               case KVM_BP_HARDEN_WA_NEEDED:
-                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL;
-               case KVM_BP_HARDEN_NOT_REQUIRED:
-                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED;
-               }
-               return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL;
-       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
-               switch (kvm_arm_have_ssbd()) {
-               case KVM_SSBD_FORCE_DISABLE:
-                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
-               case KVM_SSBD_KERNEL:
-                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL;
-               case KVM_SSBD_FORCE_ENABLE:
-               case KVM_SSBD_MITIGATED:
-                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED;
-               case KVM_SSBD_UNKNOWN:
-               default:
-                       return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN;
-               }
-       }
-
-       return -EINVAL;
-}
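
The ordering established by get_kernel_wa_level() above is what lets kvm_arm_set_fw_reg() below accept or reject a migrated workaround state with a single comparison: levels are mapped onto an integer scale where a larger value never means less protection, so user space may only request a level the host can honour. A toy sketch of that acceptance check, using made-up enum values that merely stand in for the real UAPI constants:

#include <stdbool.h>

/* Illustrative ordering only; the real values come from the KVM UAPI headers. */
enum wa_level {
        WA_NOT_AVAIL = 0,       /* no mitigation available */
        WA_AVAIL,               /* mitigation available and controllable */
        WA_NOT_REQUIRED,        /* CPU not affected: best case */
};

/* Mirror of the acceptance rule: reject anything above what the host offers. */
static bool wa_level_acceptable(enum wa_level kernel, enum wa_level requested)
{
        return requested <= kernel;
}
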
-
-int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
-{
-       void __user *uaddr = (void __user *)(long)reg->addr;
-       u64 val;
-
-       switch (reg->id) {
-       case KVM_REG_ARM_PSCI_VERSION:
-               val = kvm_psci_version(vcpu, vcpu->kvm);
-               break;
-       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
-               val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK;
-               break;
-       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
-               val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK;
-
-               if (val == KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL &&
-                   kvm_arm_get_vcpu_workaround_2_flag(vcpu))
-                       val |= KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED;
-               break;
-       default:
-               return -ENOENT;
-       }
-
-       if (copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)))
-               return -EFAULT;
-
-       return 0;
-}
-
-int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
-{
-       void __user *uaddr = (void __user *)(long)reg->addr;
-       u64 val;
-       int wa_level;
-
-       if (copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id)))
-               return -EFAULT;
-
-       switch (reg->id) {
-       case KVM_REG_ARM_PSCI_VERSION:
-       {
-               bool wants_02;
-
-               wants_02 = test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features);
-
-               switch (val) {
-               case KVM_ARM_PSCI_0_1:
-                       if (wants_02)
-                               return -EINVAL;
-                       vcpu->kvm->arch.psci_version = val;
-                       return 0;
-               case KVM_ARM_PSCI_0_2:
-               case KVM_ARM_PSCI_1_0:
-                       if (!wants_02)
-                               return -EINVAL;
-                       vcpu->kvm->arch.psci_version = val;
-                       return 0;
-               }
-               break;
-       }
-
-       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
-               if (val & ~KVM_REG_FEATURE_LEVEL_MASK)
-                       return -EINVAL;
-
-               if (get_kernel_wa_level(reg->id) < val)
-                       return -EINVAL;
-
-               return 0;
-
-       case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
-               if (val & ~(KVM_REG_FEATURE_LEVEL_MASK |
-                           KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED))
-                       return -EINVAL;
-
-               wa_level = val & KVM_REG_FEATURE_LEVEL_MASK;
-
-               if (get_kernel_wa_level(reg->id) < wa_level)
-                       return -EINVAL;
-
-               /* The enabled bit must not be set unless the level is AVAIL. */
-               if (wa_level != KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL &&
-                   wa_level != val)
-                       return -EINVAL;
-
-               /* Are we finished or do we need to check the enable bit? */
-               if (kvm_arm_have_ssbd() != KVM_SSBD_KERNEL)
-                       return 0;
-
-               /*
-                * If this kernel supports the workaround to be switched on
-                * or off, make sure it matches the requested setting.
-                */
-               switch (wa_level) {
-               case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL:
-                       kvm_arm_set_vcpu_workaround_2_flag(vcpu,
-                           val & KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED);
-                       break;
-               case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED:
-                       kvm_arm_set_vcpu_workaround_2_flag(vcpu, true);
-                       break;
-               }
-
-               return 0;
-       default:
-               return -ENOENT;
-       }
-
-       return -EINVAL;
-}
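
The AFFINITY_INFO handling in kvm_psci_vcpu_affinity_info() above boils down to masking the lowest `level * MPIDR_LEVEL_BITS` affinity bits out of each vcpu's MPIDR before comparing it against the requested target. A small, hypothetical sketch of psci_affinity_mask() with the arm64 constants written out; the numeric values are mirrored from the arch headers and should be read as assumptions of this note:

#include <stdint.h>
#include <stdio.h>

#define MPIDR_LEVEL_BITS        8               /* bits per affinity level */
#define MPIDR_HWID_BITMASK      0xff00ffffffULL /* Aff3..Aff0, arm64 layout */

/* Keep affinity levels >= 'level'; clear everything below it. */
static uint64_t affinity_mask(unsigned int level)
{
        if (level > 3)
                return 0;       /* invalid lowest affinity level */
        return MPIDR_HWID_BITMASK & ~((1ULL << (level * MPIDR_LEVEL_BITS)) - 1);
}

int main(void)
{
        /* Level 1: compare on Aff3/Aff2/Aff1 and ignore Aff0. */
        printf("%#llx\n", (unsigned long long)affinity_mask(1));
        return 0;
}
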
diff --git a/virt/kvm/arm/pvtime.c b/virt/kvm/arm/pvtime.c
deleted file mode 100644 (file)
index 1e0f4c2..0000000
+++ /dev/null
@@ -1,131 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-// Copyright (C) 2019 Arm Ltd.
-
-#include <linux/arm-smccc.h>
-#include <linux/kvm_host.h>
-
-#include <asm/kvm_mmu.h>
-#include <asm/pvclock-abi.h>
-
-#include <kvm/arm_hypercalls.h>
-
-void kvm_update_stolen_time(struct kvm_vcpu *vcpu)
-{
-       struct kvm *kvm = vcpu->kvm;
-       u64 steal;
-       __le64 steal_le;
-       u64 offset;
-       int idx;
-       u64 base = vcpu->arch.steal.base;
-
-       if (base == GPA_INVALID)
-               return;
-
-       /* Let's do the local bookkeeping */
-       steal = vcpu->arch.steal.steal;
-       steal += current->sched_info.run_delay - vcpu->arch.steal.last_steal;
-       vcpu->arch.steal.last_steal = current->sched_info.run_delay;
-       vcpu->arch.steal.steal = steal;
-
-       steal_le = cpu_to_le64(steal);
-       idx = srcu_read_lock(&kvm->srcu);
-       offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time);
-       kvm_put_guest(kvm, base + offset, steal_le, u64);
-       srcu_read_unlock(&kvm->srcu, idx);
-}
-
-long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu)
-{
-       u32 feature = smccc_get_arg1(vcpu);
-       long val = SMCCC_RET_NOT_SUPPORTED;
-
-       switch (feature) {
-       case ARM_SMCCC_HV_PV_TIME_FEATURES:
-       case ARM_SMCCC_HV_PV_TIME_ST:
-               val = SMCCC_RET_SUCCESS;
-               break;
-       }
-
-       return val;
-}
-
-gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu)
-{
-       struct pvclock_vcpu_stolen_time init_values = {};
-       struct kvm *kvm = vcpu->kvm;
-       u64 base = vcpu->arch.steal.base;
-       int idx;
-
-       if (base == GPA_INVALID)
-               return base;
-
-       /*
-        * Start counting stolen time from the point at which the guest
-        * enables the feature.
-        */
-       vcpu->arch.steal.steal = 0;
-       vcpu->arch.steal.last_steal = current->sched_info.run_delay;
-
-       idx = srcu_read_lock(&kvm->srcu);
-       kvm_write_guest(kvm, base, &init_values, sizeof(init_values));
-       srcu_read_unlock(&kvm->srcu, idx);
-
-       return base;
-}
-
-int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu,
-                           struct kvm_device_attr *attr)
-{
-       u64 __user *user = (u64 __user *)attr->addr;
-       struct kvm *kvm = vcpu->kvm;
-       u64 ipa;
-       int ret = 0;
-       int idx;
-
-       if (attr->attr != KVM_ARM_VCPU_PVTIME_IPA)
-               return -ENXIO;
-
-       if (get_user(ipa, user))
-               return -EFAULT;
-       if (!IS_ALIGNED(ipa, 64))
-               return -EINVAL;
-       if (vcpu->arch.steal.base != GPA_INVALID)
-               return -EEXIST;
-
-       /* Check the address is in a valid memslot */
-       idx = srcu_read_lock(&kvm->srcu);
-       if (kvm_is_error_hva(gfn_to_hva(kvm, ipa >> PAGE_SHIFT)))
-               ret = -EINVAL;
-       srcu_read_unlock(&kvm->srcu, idx);
-
-       if (!ret)
-               vcpu->arch.steal.base = ipa;
-
-       return ret;
-}
-
-int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu,
-                           struct kvm_device_attr *attr)
-{
-       u64 __user *user = (u64 __user *)attr->addr;
-       u64 ipa;
-
-       if (attr->attr != KVM_ARM_VCPU_PVTIME_IPA)
-               return -ENXIO;
-
-       ipa = vcpu->arch.steal.base;
-
-       if (put_user(ipa, user))
-               return -EFAULT;
-       return 0;
-}
-
-int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu,
-                           struct kvm_device_attr *attr)
-{
-       switch (attr->attr) {
-       case KVM_ARM_VCPU_PVTIME_IPA:
-               return 0;
-       }
-       return -ENXIO;
-}
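
kvm_update_stolen_time() above publishes a single 64-bit field inside the shared stolen-time record whose base IPA the guest negotiated via ARM_SMCCC_HV_PV_TIME_ST. A hedged sketch of the guest-visible record, assuming the ARM DEN 0057A layout that asm/pvclock-abi.h encodes (the struct below is written out here for illustration, not copied from this patch):

#include <stdint.h>
#include <stddef.h>

/* Stolen-time record as published by the hypervisor (little-endian fields). */
struct pvclock_vcpu_stolen_time {
        uint32_t revision;
        uint32_t attributes;
        uint64_t stolen_time;   /* nanoseconds stolen from this vcpu */
        uint8_t  padding[48];   /* record is 64 bytes and 64-byte aligned */
};

/* Guest-side read: the field is updated by the host, so read it volatile. */
uint64_t read_stolen_ns(const volatile struct pvclock_vcpu_stolen_time *st)
{
        return st->stolen_time;
}

/* The host writes at base + this offset, matching kvm_update_stolen_time(). */
_Static_assert(offsetof(struct pvclock_vcpu_stolen_time, stolen_time) == 8,
               "stolen_time must live 8 bytes into the record");
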
diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h
deleted file mode 100644 (file)
index cc94ccc..0000000
+++ /dev/null
@@ -1,379 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_KVM_H
-
-#include <kvm/arm_arch_timer.h>
-#include <linux/tracepoint.h>
-#include <asm/kvm_arm.h>
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM kvm
-
-/*
- * Tracepoints for entry/exit to guest
- */
-TRACE_EVENT(kvm_entry,
-       TP_PROTO(unsigned long vcpu_pc),
-       TP_ARGS(vcpu_pc),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,  vcpu_pc         )
-       ),
-
-       TP_fast_assign(
-               __entry->vcpu_pc                = vcpu_pc;
-       ),
-
-       TP_printk("PC: 0x%08lx", __entry->vcpu_pc)
-);
-
-TRACE_EVENT(kvm_exit,
-       TP_PROTO(int ret, unsigned int esr_ec, unsigned long vcpu_pc),
-       TP_ARGS(ret, esr_ec, vcpu_pc),
-
-       TP_STRUCT__entry(
-               __field(        int,            ret             )
-               __field(        unsigned int,   esr_ec          )
-               __field(        unsigned long,  vcpu_pc         )
-       ),
-
-       TP_fast_assign(
-               __entry->ret                    = ARM_EXCEPTION_CODE(ret);
-               __entry->esr_ec = ARM_EXCEPTION_IS_TRAP(ret) ? esr_ec : 0;
-               __entry->vcpu_pc                = vcpu_pc;
-       ),
-
-       TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx",
-                 __print_symbolic(__entry->ret, kvm_arm_exception_type),
-                 __entry->esr_ec,
-                 __print_symbolic(__entry->esr_ec, kvm_arm_exception_class),
-                 __entry->vcpu_pc)
-);
-
-TRACE_EVENT(kvm_guest_fault,
-       TP_PROTO(unsigned long vcpu_pc, unsigned long hsr,
-                unsigned long hxfar,
-                unsigned long long ipa),
-       TP_ARGS(vcpu_pc, hsr, hxfar, ipa),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,  vcpu_pc         )
-               __field(        unsigned long,  hsr             )
-               __field(        unsigned long,  hxfar           )
-               __field(   unsigned long long,  ipa             )
-       ),
-
-       TP_fast_assign(
-               __entry->vcpu_pc                = vcpu_pc;
-               __entry->hsr                    = hsr;
-               __entry->hxfar                  = hxfar;
-               __entry->ipa                    = ipa;
-       ),
-
-       TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#08lx",
-                 __entry->ipa, __entry->hsr,
-                 __entry->hxfar, __entry->vcpu_pc)
-);
-
-TRACE_EVENT(kvm_access_fault,
-       TP_PROTO(unsigned long ipa),
-       TP_ARGS(ipa),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,  ipa             )
-       ),
-
-       TP_fast_assign(
-               __entry->ipa            = ipa;
-       ),
-
-       TP_printk("IPA: %lx", __entry->ipa)
-);
-
-TRACE_EVENT(kvm_irq_line,
-       TP_PROTO(unsigned int type, int vcpu_idx, int irq_num, int level),
-       TP_ARGS(type, vcpu_idx, irq_num, level),
-
-       TP_STRUCT__entry(
-               __field(        unsigned int,   type            )
-               __field(        int,            vcpu_idx        )
-               __field(        int,            irq_num         )
-               __field(        int,            level           )
-       ),
-
-       TP_fast_assign(
-               __entry->type           = type;
-               __entry->vcpu_idx       = vcpu_idx;
-               __entry->irq_num        = irq_num;
-               __entry->level          = level;
-       ),
-
-       TP_printk("Inject %s interrupt (%d), vcpu->idx: %d, num: %d, level: %d",
-                 (__entry->type == KVM_ARM_IRQ_TYPE_CPU) ? "CPU" :
-                 (__entry->type == KVM_ARM_IRQ_TYPE_PPI) ? "VGIC PPI" :
-                 (__entry->type == KVM_ARM_IRQ_TYPE_SPI) ? "VGIC SPI" : "UNKNOWN",
-                 __entry->type, __entry->vcpu_idx, __entry->irq_num, __entry->level)
-);
-
-TRACE_EVENT(kvm_mmio_emulate,
-       TP_PROTO(unsigned long vcpu_pc, unsigned long instr,
-                unsigned long cpsr),
-       TP_ARGS(vcpu_pc, instr, cpsr),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,  vcpu_pc         )
-               __field(        unsigned long,  instr           )
-               __field(        unsigned long,  cpsr            )
-       ),
-
-       TP_fast_assign(
-               __entry->vcpu_pc                = vcpu_pc;
-               __entry->instr                  = instr;
-               __entry->cpsr                   = cpsr;
-       ),
-
-       TP_printk("Emulate MMIO at: 0x%08lx (instr: %08lx, cpsr: %08lx)",
-                 __entry->vcpu_pc, __entry->instr, __entry->cpsr)
-);
-
-TRACE_EVENT(kvm_unmap_hva_range,
-       TP_PROTO(unsigned long start, unsigned long end),
-       TP_ARGS(start, end),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,  start           )
-               __field(        unsigned long,  end             )
-       ),
-
-       TP_fast_assign(
-               __entry->start          = start;
-               __entry->end            = end;
-       ),
-
-       TP_printk("mmu notifier unmap range: %#08lx -- %#08lx",
-                 __entry->start, __entry->end)
-);
-
-TRACE_EVENT(kvm_set_spte_hva,
-       TP_PROTO(unsigned long hva),
-       TP_ARGS(hva),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,  hva             )
-       ),
-
-       TP_fast_assign(
-               __entry->hva            = hva;
-       ),
-
-       TP_printk("mmu notifier set pte hva: %#08lx", __entry->hva)
-);
-
-TRACE_EVENT(kvm_age_hva,
-       TP_PROTO(unsigned long start, unsigned long end),
-       TP_ARGS(start, end),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,  start           )
-               __field(        unsigned long,  end             )
-       ),
-
-       TP_fast_assign(
-               __entry->start          = start;
-               __entry->end            = end;
-       ),
-
-       TP_printk("mmu notifier age hva: %#08lx -- %#08lx",
-                 __entry->start, __entry->end)
-);
-
-TRACE_EVENT(kvm_test_age_hva,
-       TP_PROTO(unsigned long hva),
-       TP_ARGS(hva),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,  hva             )
-       ),
-
-       TP_fast_assign(
-               __entry->hva            = hva;
-       ),
-
-       TP_printk("mmu notifier test age hva: %#08lx", __entry->hva)
-);
-
-TRACE_EVENT(kvm_set_way_flush,
-           TP_PROTO(unsigned long vcpu_pc, bool cache),
-           TP_ARGS(vcpu_pc, cache),
-
-           TP_STRUCT__entry(
-                   __field(    unsigned long,  vcpu_pc         )
-                   __field(    bool,           cache           )
-           ),
-
-           TP_fast_assign(
-                   __entry->vcpu_pc            = vcpu_pc;
-                   __entry->cache              = cache;
-           ),
-
-           TP_printk("S/W flush at 0x%016lx (cache %s)",
-                     __entry->vcpu_pc, __entry->cache ? "on" : "off")
-);
-
-TRACE_EVENT(kvm_toggle_cache,
-           TP_PROTO(unsigned long vcpu_pc, bool was, bool now),
-           TP_ARGS(vcpu_pc, was, now),
-
-           TP_STRUCT__entry(
-                   __field(    unsigned long,  vcpu_pc         )
-                   __field(    bool,           was             )
-                   __field(    bool,           now             )
-           ),
-
-           TP_fast_assign(
-                   __entry->vcpu_pc            = vcpu_pc;
-                   __entry->was                = was;
-                   __entry->now                = now;
-           ),
-
-           TP_printk("VM op at 0x%016lx (cache was %s, now %s)",
-                     __entry->vcpu_pc, __entry->was ? "on" : "off",
-                     __entry->now ? "on" : "off")
-);
-
-/*
- * Tracepoints for arch_timer
- */
-TRACE_EVENT(kvm_timer_update_irq,
-       TP_PROTO(unsigned long vcpu_id, __u32 irq, int level),
-       TP_ARGS(vcpu_id, irq, level),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,  vcpu_id )
-               __field(        __u32,          irq     )
-               __field(        int,            level   )
-       ),
-
-       TP_fast_assign(
-               __entry->vcpu_id        = vcpu_id;
-               __entry->irq            = irq;
-               __entry->level          = level;
-       ),
-
-       TP_printk("VCPU: %ld, IRQ %d, level %d",
-                 __entry->vcpu_id, __entry->irq, __entry->level)
-);
-
-TRACE_EVENT(kvm_get_timer_map,
-       TP_PROTO(unsigned long vcpu_id, struct timer_map *map),
-       TP_ARGS(vcpu_id, map),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,          vcpu_id )
-               __field(        int,                    direct_vtimer   )
-               __field(        int,                    direct_ptimer   )
-               __field(        int,                    emul_ptimer     )
-       ),
-
-       TP_fast_assign(
-               __entry->vcpu_id                = vcpu_id;
-               __entry->direct_vtimer          = arch_timer_ctx_index(map->direct_vtimer);
-               __entry->direct_ptimer =
-                       (map->direct_ptimer) ? arch_timer_ctx_index(map->direct_ptimer) : -1;
-               __entry->emul_ptimer =
-                       (map->emul_ptimer) ? arch_timer_ctx_index(map->emul_ptimer) : -1;
-       ),
-
-       TP_printk("VCPU: %ld, dv: %d, dp: %d, ep: %d",
-                 __entry->vcpu_id,
-                 __entry->direct_vtimer,
-                 __entry->direct_ptimer,
-                 __entry->emul_ptimer)
-);
-
-TRACE_EVENT(kvm_timer_save_state,
-       TP_PROTO(struct arch_timer_context *ctx),
-       TP_ARGS(ctx),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,          ctl             )
-               __field(        unsigned long long,     cval            )
-               __field(        int,                    timer_idx       )
-       ),
-
-       TP_fast_assign(
-               __entry->ctl                    = ctx->cnt_ctl;
-               __entry->cval                   = ctx->cnt_cval;
-               __entry->timer_idx              = arch_timer_ctx_index(ctx);
-       ),
-
-       TP_printk("   CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d",
-                 __entry->ctl,
-                 __entry->cval,
-                 __entry->timer_idx)
-);
-
-TRACE_EVENT(kvm_timer_restore_state,
-       TP_PROTO(struct arch_timer_context *ctx),
-       TP_ARGS(ctx),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,          ctl             )
-               __field(        unsigned long long,     cval            )
-               __field(        int,                    timer_idx       )
-       ),
-
-       TP_fast_assign(
-               __entry->ctl                    = ctx->cnt_ctl;
-               __entry->cval                   = ctx->cnt_cval;
-               __entry->timer_idx              = arch_timer_ctx_index(ctx);
-       ),
-
-       TP_printk("CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d",
-                 __entry->ctl,
-                 __entry->cval,
-                 __entry->timer_idx)
-);
-
-TRACE_EVENT(kvm_timer_hrtimer_expire,
-       TP_PROTO(struct arch_timer_context *ctx),
-       TP_ARGS(ctx),
-
-       TP_STRUCT__entry(
-               __field(        int,                    timer_idx       )
-       ),
-
-       TP_fast_assign(
-               __entry->timer_idx              = arch_timer_ctx_index(ctx);
-       ),
-
-       TP_printk("arch_timer_ctx_index: %d", __entry->timer_idx)
-);
-
-TRACE_EVENT(kvm_timer_emulate,
-       TP_PROTO(struct arch_timer_context *ctx, bool should_fire),
-       TP_ARGS(ctx, should_fire),
-
-       TP_STRUCT__entry(
-               __field(        int,                    timer_idx       )
-               __field(        bool,                   should_fire     )
-       ),
-
-       TP_fast_assign(
-               __entry->timer_idx              = arch_timer_ctx_index(ctx);
-               __entry->should_fire            = should_fire;
-       ),
-
-       TP_printk("arch_timer_ctx_index: %d (should_fire: %d)",
-                 __entry->timer_idx, __entry->should_fire)
-);
-
-#endif /* _TRACE_KVM_H */
-
-#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH ../../virt/kvm/arm
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_FILE trace
-
-/* This part must be outside protection */
-#include <trace/define_trace.h>
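
One practical consequence of the relocation is visible in the trailer of this header: TRACE_INCLUDE_PATH is resolved relative to include/trace, so the deleted value points back into virt/kvm/arm and cannot survive the move unchanged. The replacement header under arch/arm64/kvm would need its own path/file definitions, roughly along the lines below; this is illustrative only, since the new trace_arm.h is not shown in this hunk.

/* Tail of a relocated trace header: resolve includes relative to the
 * header's own directory instead of reaching back into virt/kvm/arm.
 */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE trace_arm

/* This part must be outside protection */
#include <trace/define_trace.h>
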
diff --git a/virt/kvm/arm/vgic/trace.h b/virt/kvm/arm/vgic/trace.h
deleted file mode 100644 (file)
index 4fd4f6d..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#if !defined(_TRACE_VGIC_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_VGIC_H
-
-#include <linux/tracepoint.h>
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM kvm
-
-TRACE_EVENT(vgic_update_irq_pending,
-       TP_PROTO(unsigned long vcpu_id, __u32 irq, bool level),
-       TP_ARGS(vcpu_id, irq, level),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,  vcpu_id )
-               __field(        __u32,          irq     )
-               __field(        bool,           level   )
-       ),
-
-       TP_fast_assign(
-               __entry->vcpu_id        = vcpu_id;
-               __entry->irq            = irq;
-               __entry->level          = level;
-       ),
-
-       TP_printk("VCPU: %ld, IRQ %d, level: %d",
-                 __entry->vcpu_id, __entry->irq, __entry->level)
-);
-
-#endif /* _TRACE_VGIC_H */
-
-#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH ../../virt/kvm/arm/vgic
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_FILE trace
-
-/* This part must be outside protection */
-#include <trace/define_trace.h>
diff --git a/virt/kvm/arm/vgic/vgic-debug.c b/virt/kvm/arm/vgic/vgic-debug.c
deleted file mode 100644 (file)
index b13a9e3..0000000
+++ /dev/null
@@ -1,300 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2016 Linaro
- * Author: Christoffer Dall <christoffer.dall@linaro.org>
- */
-
-#include <linux/cpu.h>
-#include <linux/debugfs.h>
-#include <linux/interrupt.h>
-#include <linux/kvm_host.h>
-#include <linux/seq_file.h>
-#include <kvm/arm_vgic.h>
-#include <asm/kvm_mmu.h>
-#include "vgic.h"
-
-/*
- * Structure to control looping through the entire vgic state.  We start at
- * zero for each field and move upwards.  So, if dist_id is 0 we print the
- * distributor info.  When dist_id is 1, we have already printed it and move
- * on.
- *
- * When vcpu_id < nr_cpus we print the vcpu info until vcpu_id == nr_cpus and
- * so on.
- */
-struct vgic_state_iter {
-       int nr_cpus;
-       int nr_spis;
-       int nr_lpis;
-       int dist_id;
-       int vcpu_id;
-       int intid;
-       int lpi_idx;
-       u32 *lpi_array;
-};
-
-static void iter_next(struct vgic_state_iter *iter)
-{
-       if (iter->dist_id == 0) {
-               iter->dist_id++;
-               return;
-       }
-
-       iter->intid++;
-       if (iter->intid == VGIC_NR_PRIVATE_IRQS &&
-           ++iter->vcpu_id < iter->nr_cpus)
-               iter->intid = 0;
-
-       if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS)) {
-               if (iter->lpi_idx < iter->nr_lpis)
-                       iter->intid = iter->lpi_array[iter->lpi_idx];
-               iter->lpi_idx++;
-       }
-}
-
-static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter,
-                     loff_t pos)
-{
-       int nr_cpus = atomic_read(&kvm->online_vcpus);
-
-       memset(iter, 0, sizeof(*iter));
-
-       iter->nr_cpus = nr_cpus;
-       iter->nr_spis = kvm->arch.vgic.nr_spis;
-       if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
-               iter->nr_lpis = vgic_copy_lpi_list(kvm, NULL, &iter->lpi_array);
-               if (iter->nr_lpis < 0)
-                       iter->nr_lpis = 0;
-       }
-
-       /* Fast forward to the right position if needed */
-       while (pos--)
-               iter_next(iter);
-}
-
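-/*
- * Descriptive comment (added for clarity): the iteration is complete
- * once the distributor, every vCPU's private interrupts, all SPIs and
- * the LPI snapshot have been visited.
- */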
-static bool end_of_vgic(struct vgic_state_iter *iter)
-{
-       return iter->dist_id > 0 &&
-               iter->vcpu_id == iter->nr_cpus &&
-               iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS) &&
-               iter->lpi_idx > iter->nr_lpis;
-}
-
-static void *vgic_debug_start(struct seq_file *s, loff_t *pos)
-{
-       struct kvm *kvm = (struct kvm *)s->private;
-       struct vgic_state_iter *iter;
-
-       mutex_lock(&kvm->lock);
-       iter = kvm->arch.vgic.iter;
-       if (iter) {
-               iter = ERR_PTR(-EBUSY);
-               goto out;
-       }
-
-       iter = kmalloc(sizeof(*iter), GFP_KERNEL);
-       if (!iter) {
-               iter = ERR_PTR(-ENOMEM);
-               goto out;
-       }
-
-       iter_init(kvm, iter, *pos);
-       kvm->arch.vgic.iter = iter;
-
-       if (end_of_vgic(iter))
-               iter = NULL;
-out:
-       mutex_unlock(&kvm->lock);
-       return iter;
-}
-
-static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos)
-{
-       struct kvm *kvm = (struct kvm *)s->private;
-       struct vgic_state_iter *iter = kvm->arch.vgic.iter;
-
-       ++*pos;
-       iter_next(iter);
-       if (end_of_vgic(iter))
-               iter = NULL;
-       return iter;
-}
-
-static void vgic_debug_stop(struct seq_file *s, void *v)
-{
-       struct kvm *kvm = (struct kvm *)s->private;
-       struct vgic_state_iter *iter;
-
-       /*
-        * If the seq file wasn't properly opened, there's nothing to clean
-        * up.
-        */
-       if (IS_ERR(v))
-               return;
-
-       mutex_lock(&kvm->lock);
-       iter = kvm->arch.vgic.iter;
-       kfree(iter->lpi_array);
-       kfree(iter);
-       kvm->arch.vgic.iter = NULL;
-       mutex_unlock(&kvm->lock);
-}
-
-static void print_dist_state(struct seq_file *s, struct vgic_dist *dist)
-{
-       bool v3 = dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3;
-
-       seq_printf(s, "Distributor\n");
-       seq_printf(s, "===========\n");
-       seq_printf(s, "vgic_model:\t%s\n", v3 ? "GICv3" : "GICv2");
-       seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis);
-       if (v3)
-               seq_printf(s, "nr_lpis:\t%d\n", dist->lpi_list_count);
-       seq_printf(s, "enabled:\t%d\n", dist->enabled);
-       seq_printf(s, "\n");
-
-       seq_printf(s, "P=pending_latch, L=line_level, A=active\n");
-       seq_printf(s, "E=enabled, H=hw, C=config (level=1, edge=0)\n");
-       seq_printf(s, "G=group\n");
-}
-
-static void print_header(struct seq_file *s, struct vgic_irq *irq,
-                        struct kvm_vcpu *vcpu)
-{
-       int id = 0;
-       char *hdr = "SPI ";
-
-       if (vcpu) {
-               hdr = "VCPU";
-               id = vcpu->vcpu_id;
-       }
-
-       seq_printf(s, "\n");
-       seq_printf(s, "%s%2d TYP   ID TGT_ID PLAEHCG     HWID   TARGET SRC PRI VCPU_ID\n", hdr, id);
-       seq_printf(s, "----------------------------------------------------------------\n");
-}
-
-static void print_irq_state(struct seq_file *s, struct vgic_irq *irq,
-                           struct kvm_vcpu *vcpu)
-{
-       char *type;
-       bool pending;
-
-       if (irq->intid < VGIC_NR_SGIS)
-               type = "SGI";
-       else if (irq->intid < VGIC_NR_PRIVATE_IRQS)
-               type = "PPI";
-       else if (irq->intid < VGIC_MAX_SPI)
-               type = "SPI";
-       else
-               type = "LPI";
-
-       if (irq->intid == 0 || irq->intid == VGIC_NR_PRIVATE_IRQS)
-               print_header(s, irq, vcpu);
-
-       pending = irq->pending_latch;
-       if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
-               int err;
-
-               err = irq_get_irqchip_state(irq->host_irq,
-                                           IRQCHIP_STATE_PENDING,
-                                           &pending);
-               WARN_ON_ONCE(err);
-       }
-
-       seq_printf(s, "       %s %4d "
-                     "    %2d "
-                     "%d%d%d%d%d%d%d "
-                     "%8d "
-                     "%8x "
-                     " %2x "
-                     "%3d "
-                     "     %2d "
-                     "\n",
-                       type, irq->intid,
-                       (irq->target_vcpu) ? irq->target_vcpu->vcpu_id : -1,
-                       pending,
-                       irq->line_level,
-                       irq->active,
-                       irq->enabled,
-                       irq->hw,
-                       irq->config == VGIC_CONFIG_LEVEL,
-                       irq->group,
-                       irq->hwintid,
-                       irq->mpidr,
-                       irq->source,
-                       irq->priority,
-                       (irq->vcpu) ? irq->vcpu->vcpu_id : -1);
-}
-
-static int vgic_debug_show(struct seq_file *s, void *v)
-{
-       struct kvm *kvm = (struct kvm *)s->private;
-       struct vgic_state_iter *iter = (struct vgic_state_iter *)v;
-       struct vgic_irq *irq;
-       struct kvm_vcpu *vcpu = NULL;
-       unsigned long flags;
-
-       if (iter->dist_id == 0) {
-               print_dist_state(s, &kvm->arch.vgic);
-               return 0;
-       }
-
-       if (!kvm->arch.vgic.initialized)
-               return 0;
-
-       if (iter->vcpu_id < iter->nr_cpus)
-               vcpu = kvm_get_vcpu(kvm, iter->vcpu_id);
-
-       irq = vgic_get_irq(kvm, vcpu, iter->intid);
-       if (!irq) {
-               seq_printf(s, "       LPI %4d freed\n", iter->intid);
-               return 0;
-       }
-
-       raw_spin_lock_irqsave(&irq->irq_lock, flags);
-       print_irq_state(s, irq, vcpu);
-       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-       vgic_put_irq(kvm, irq);
-       return 0;
-}
-
-static const struct seq_operations vgic_debug_seq_ops = {
-       .start = vgic_debug_start,
-       .next  = vgic_debug_next,
-       .stop  = vgic_debug_stop,
-       .show  = vgic_debug_show
-};
-
-static int debug_open(struct inode *inode, struct file *file)
-{
-       int ret;
-       ret = seq_open(file, &vgic_debug_seq_ops);
-       if (!ret) {
-               struct seq_file *seq;
-               /* seq_open will have modified file->private_data */
-               seq = file->private_data;
-               seq->private = inode->i_private;
-       }
-
-       return ret;
-}
-
-static const struct file_operations vgic_debug_fops = {
-       .owner   = THIS_MODULE,
-       .open    = debug_open,
-       .read    = seq_read,
-       .llseek  = seq_lseek,
-       .release = seq_release
-};
-
-void vgic_debug_init(struct kvm *kvm)
-{
-       debugfs_create_file("vgic-state", 0444, kvm->debugfs_dentry, kvm,
-                           &vgic_debug_fops);
-}
-
-void vgic_debug_destroy(struct kvm *kvm)
-{
-}
diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c
deleted file mode 100644 (file)
index 32e32d6..0000000
+++ /dev/null
@@ -1,556 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2015, 2016 ARM Ltd.
- */
-
-#include <linux/uaccess.h>
-#include <linux/interrupt.h>
-#include <linux/cpu.h>
-#include <linux/kvm_host.h>
-#include <kvm/arm_vgic.h>
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_mmu.h>
-#include "vgic.h"
-
-/*
- * Initialization rules: there are multiple stages to the vgic
- * initialization, both for the distributor and the CPU interfaces.  The basic
- * idea is that even though the VGIC is not functional or not requested from
- * user space, the critical path of the run loop can still call VGIC functions
- * that just won't do anything, without them having to check additional
- * initialization flags to ensure they don't look at uninitialized data
- * structures.
- *
- * Distributor:
- *
- * - kvm_vgic_early_init(): initialization of static data that doesn't
- *   depend on any sizing information or emulation type. No allocation
- *   is allowed there.
- *
- * - vgic_init(): allocation and initialization of the generic data
- *   structures that depend on sizing information (number of CPUs,
- *   number of interrupts). Also initializes the vcpu specific data
- *   structures. Can be executed lazily for GICv2.
- *
- * CPU Interface:
- *
- * - kvm_vgic_vcpu_init(): initialization of static data that
- *   doesn't depend on any sizing information or emulation type. No
- *   allocation is allowed there.
- */
-
-/* EARLY INIT */
-
-/**
- * kvm_vgic_early_init() - Initialize static VGIC VCPU data structures
- * @kvm: The VM whose VGIC distributor should be initialized
- *
- * Only do initialization of static structures that don't require any
- * allocation or sizing information from userspace.  vgic_init() calls
- * kvm_vgic_dist_init(), which takes care of the rest.
- */
-void kvm_vgic_early_init(struct kvm *kvm)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-
-       INIT_LIST_HEAD(&dist->lpi_list_head);
-       INIT_LIST_HEAD(&dist->lpi_translation_cache);
-       raw_spin_lock_init(&dist->lpi_list_lock);
-}
-
-/* CREATION */
-
-/**
- * kvm_vgic_create: triggered by the instantiation of the VGIC device by
- * user space, either through the legacy KVM_CREATE_IRQCHIP ioctl (v2 only)
- * or through the generic KVM_CREATE_DEVICE API ioctl.
- * irqchip_in_kernel() tells you if this function succeeded or not.
- * @kvm: kvm struct pointer
- * @type: KVM_DEV_TYPE_ARM_VGIC_V[23]
- */
-int kvm_vgic_create(struct kvm *kvm, u32 type)
-{
-       int i, ret;
-       struct kvm_vcpu *vcpu;
-
-       if (irqchip_in_kernel(kvm))
-               return -EEXIST;
-
-       /*
-        * This function is also called by the KVM_CREATE_IRQCHIP handler,
-        * which had no chance yet to check the availability of the GICv2
-        * emulation. So check this here again. KVM_CREATE_DEVICE does
-        * the proper checks already.
-        */
-       if (type == KVM_DEV_TYPE_ARM_VGIC_V2 &&
-               !kvm_vgic_global_state.can_emulate_gicv2)
-               return -ENODEV;
-
-       ret = -EBUSY;
-       if (!lock_all_vcpus(kvm))
-               return ret;
-
-       kvm_for_each_vcpu(i, vcpu, kvm) {
-               if (vcpu->arch.has_run_once)
-                       goto out_unlock;
-       }
-       ret = 0;
-
-       if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
-               kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS;
-       else
-               kvm->arch.max_vcpus = VGIC_V3_MAX_CPUS;
-
-       if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus) {
-               ret = -E2BIG;
-               goto out_unlock;
-       }
-
-       kvm->arch.vgic.in_kernel = true;
-       kvm->arch.vgic.vgic_model = type;
-
-       kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
-
-       if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
-               kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
-       else
-               INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions);
-
-out_unlock:
-       unlock_all_vcpus(kvm);
-       return ret;
-}
-
-/* INIT/DESTROY */
-
-/**
- * kvm_vgic_dist_init: initialize the dist data structures
- * @kvm: kvm struct pointer
- * @nr_spis: number of spis, frozen by caller
- */
-static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0);
-       int i;
-
-       dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL);
-       if (!dist->spis)
-               return  -ENOMEM;
-
-       /*
-        * In the following code we do not take the irq struct lock since
-        * no other action on irq structs can happen while the VGIC is
-        * not initialized yet:
-        * If someone wants to inject an interrupt or does an MMIO access, we
-        * require prior initialization in case of a virtual GICv3 or trigger
-        * initialization when using a virtual GICv2.
-        */
-       for (i = 0; i < nr_spis; i++) {
-               struct vgic_irq *irq = &dist->spis[i];
-
-               irq->intid = i + VGIC_NR_PRIVATE_IRQS;
-               INIT_LIST_HEAD(&irq->ap_list);
-               raw_spin_lock_init(&irq->irq_lock);
-               irq->vcpu = NULL;
-               irq->target_vcpu = vcpu0;
-               kref_init(&irq->refcount);
-               switch (dist->vgic_model) {
-               case KVM_DEV_TYPE_ARM_VGIC_V2:
-                       irq->targets = 0;
-                       irq->group = 0;
-                       break;
-               case KVM_DEV_TYPE_ARM_VGIC_V3:
-                       irq->mpidr = 0;
-                       irq->group = 1;
-                       break;
-               default:
-                       kfree(dist->spis);
-                       dist->spis = NULL;
-                       return -EINVAL;
-               }
-       }
-       return 0;
-}
-
-/**
- * kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data
- * structures and register VCPU-specific KVM iodevs
- *
- * @vcpu: pointer to the VCPU being created and initialized
- *
- * Only do initialization, but do not actually enable the
- * VGIC CPU interface
- */
-int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
-{
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       int ret = 0;
-       int i;
-
-       vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
-
-       INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
-       raw_spin_lock_init(&vgic_cpu->ap_list_lock);
-       atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0);
-
-       /*
-        * Enable and configure all SGIs to be edge-triggered and
-        * configure all PPIs as level-triggered.
-        */
-       for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
-               struct vgic_irq *irq = &vgic_cpu->private_irqs[i];
-
-               INIT_LIST_HEAD(&irq->ap_list);
-               raw_spin_lock_init(&irq->irq_lock);
-               irq->intid = i;
-               irq->vcpu = NULL;
-               irq->target_vcpu = vcpu;
-               kref_init(&irq->refcount);
-               if (vgic_irq_is_sgi(i)) {
-                       /* SGIs */
-                       irq->enabled = 1;
-                       irq->config = VGIC_CONFIG_EDGE;
-               } else {
-                       /* PPIs */
-                       irq->config = VGIC_CONFIG_LEVEL;
-               }
-       }
-
-       if (!irqchip_in_kernel(vcpu->kvm))
-               return 0;
-
-       /*
-        * If we are creating a VCPU with a GICv3 we must also register the
-        * KVM io device for the redistributor that belongs to this VCPU.
-        */
-       if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
-               mutex_lock(&vcpu->kvm->lock);
-               ret = vgic_register_redist_iodev(vcpu);
-               mutex_unlock(&vcpu->kvm->lock);
-       }
-       return ret;
-}
-
-static void kvm_vgic_vcpu_enable(struct kvm_vcpu *vcpu)
-{
-       if (kvm_vgic_global_state.type == VGIC_V2)
-               vgic_v2_enable(vcpu);
-       else
-               vgic_v3_enable(vcpu);
-}
-
-/*
- * vgic_init: allocates and initializes dist and vcpu data structures
- * depending on two dimensioning parameters:
- * - the number of spis
- * - the number of vcpus
- * The function is generally called when nr_spis has been explicitly set
- * by the guest through the KVM DEVICE API. If not, nr_spis is set to 256.
- * vgic_initialized() returns true when this function has succeeded.
- * Must be called with kvm->lock held!
- */
-int vgic_init(struct kvm *kvm)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       struct kvm_vcpu *vcpu;
-       int ret = 0, i, idx;
-
-       if (vgic_initialized(kvm))
-               return 0;
-
-       /* Are we also in the middle of creating a VCPU? */
-       if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus))
-               return -EBUSY;
-
-       /* freeze the number of spis */
-       if (!dist->nr_spis)
-               dist->nr_spis = VGIC_NR_IRQS_LEGACY - VGIC_NR_PRIVATE_IRQS;
-
-       ret = kvm_vgic_dist_init(kvm, dist->nr_spis);
-       if (ret)
-               goto out;
-
-       /* Initialize groups on CPUs created before the VGIC type was known */
-       kvm_for_each_vcpu(idx, vcpu, kvm) {
-               struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
-               for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
-                       struct vgic_irq *irq = &vgic_cpu->private_irqs[i];
-                       switch (dist->vgic_model) {
-                       case KVM_DEV_TYPE_ARM_VGIC_V3:
-                               irq->group = 1;
-                               irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
-                               break;
-                       case KVM_DEV_TYPE_ARM_VGIC_V2:
-                               irq->group = 0;
-                               irq->targets = 1U << idx;
-                               break;
-                       default:
-                               ret = -EINVAL;
-                               goto out;
-                       }
-               }
-       }
-
-       if (vgic_has_its(kvm))
-               vgic_lpi_translation_cache_init(kvm);
-
-       /*
-        * If we have GICv4.1 enabled, unconditionally request the
-        * v4 support so that we get HW-accelerated vSGIs. Otherwise, only
-        * enable it if we present a virtual ITS to the guest.
-        */
-       if (vgic_supports_direct_msis(kvm)) {
-               ret = vgic_v4_init(kvm);
-               if (ret)
-                       goto out;
-       }
-
-       kvm_for_each_vcpu(i, vcpu, kvm)
-               kvm_vgic_vcpu_enable(vcpu);
-
-       ret = kvm_vgic_setup_default_irq_routing(kvm);
-       if (ret)
-               goto out;
-
-       vgic_debug_init(kvm);
-
-       dist->implementation_rev = 2;
-       dist->initialized = true;
-
-out:
-       return ret;
-}
-
-static void kvm_vgic_dist_destroy(struct kvm *kvm)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       struct vgic_redist_region *rdreg, *next;
-
-       dist->ready = false;
-       dist->initialized = false;
-
-       kfree(dist->spis);
-       dist->spis = NULL;
-       dist->nr_spis = 0;
-
-       if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
-               list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) {
-                       list_del(&rdreg->list);
-                       kfree(rdreg);
-               }
-               INIT_LIST_HEAD(&dist->rd_regions);
-       }
-
-       if (vgic_has_its(kvm))
-               vgic_lpi_translation_cache_destroy(kvm);
-
-       if (vgic_supports_direct_msis(kvm))
-               vgic_v4_teardown(kvm);
-}
-
-void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
-{
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
-       /*
-        * Retire all pending LPIs on this vcpu anyway as we're
-        * going to destroy it.
-        */
-       vgic_flush_pending_lpis(vcpu);
-
-       INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
-}
-
-/* To be called with kvm->lock held */
-static void __kvm_vgic_destroy(struct kvm *kvm)
-{
-       struct kvm_vcpu *vcpu;
-       int i;
-
-       vgic_debug_destroy(kvm);
-
-       kvm_for_each_vcpu(i, vcpu, kvm)
-               kvm_vgic_vcpu_destroy(vcpu);
-
-       kvm_vgic_dist_destroy(kvm);
-}
-
-void kvm_vgic_destroy(struct kvm *kvm)
-{
-       mutex_lock(&kvm->lock);
-       __kvm_vgic_destroy(kvm);
-       mutex_unlock(&kvm->lock);
-}
-
-/**
- * vgic_lazy_init: Lazy init is only allowed if the GIC exposed to the guest
- * is a GICv2. A GICv3 must be explicitly initialized by the guest using the
- * KVM_DEV_ARM_VGIC_GRP_CTRL KVM_DEVICE group.
- * @kvm: kvm struct pointer
- */
-int vgic_lazy_init(struct kvm *kvm)
-{
-       int ret = 0;
-
-       if (unlikely(!vgic_initialized(kvm))) {
-               /*
-                * We only provide the automatic initialization of the VGIC
-                * for the legacy case of a GICv2. Any other type must
-                * be explicitly initialized once setup with the respective
-                * KVM device call.
-                */
-               if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2)
-                       return -EBUSY;
-
-               mutex_lock(&kvm->lock);
-               ret = vgic_init(kvm);
-               mutex_unlock(&kvm->lock);
-       }
-
-       return ret;
-}
-
-/* RESOURCE MAPPING */
-
-/**
- * Map the MMIO regions depending on the VGIC model exposed to the guest,
- * called on the first VCPU run.
- * Also map the virtual CPU interface into the VM.
- * v2/v3 derivatives call vgic_init if not already done.
- * vgic_ready() returns true if this function has succeeded.
- * @kvm: kvm struct pointer
- */
-int kvm_vgic_map_resources(struct kvm *kvm)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       int ret = 0;
-
-       mutex_lock(&kvm->lock);
-       if (!irqchip_in_kernel(kvm))
-               goto out;
-
-       if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2)
-               ret = vgic_v2_map_resources(kvm);
-       else
-               ret = vgic_v3_map_resources(kvm);
-
-       if (ret)
-               __kvm_vgic_destroy(kvm);
-
-out:
-       mutex_unlock(&kvm->lock);
-       return ret;
-}
-
-/* GENERIC PROBE */
-
-static int vgic_init_cpu_starting(unsigned int cpu)
-{
-       enable_percpu_irq(kvm_vgic_global_state.maint_irq, 0);
-       return 0;
-}
-
-static int vgic_init_cpu_dying(unsigned int cpu)
-{
-       disable_percpu_irq(kvm_vgic_global_state.maint_irq);
-       return 0;
-}
-
-static irqreturn_t vgic_maintenance_handler(int irq, void *data)
-{
-       /*
-        * We cannot rely on the vgic maintenance interrupt to be
-        * delivered synchronously. This means we can only use it to
-        * exit the VM, and we perform the handling of EOIed
-        * interrupts on the exit path (see vgic_fold_lr_state).
-        */
-       return IRQ_HANDLED;
-}
-
-/**
- * kvm_vgic_init_cpu_hardware - initialize the GIC VE hardware
- *
- * For a specific CPU, initialize the GIC VE hardware.
- */
-void kvm_vgic_init_cpu_hardware(void)
-{
-       BUG_ON(preemptible());
-
-       /*
-        * We want to make sure the list registers start out clear so that we
-        * only have to program the used registers.
-        */
-       if (kvm_vgic_global_state.type == VGIC_V2)
-               vgic_v2_init_lrs();
-       else
-               kvm_call_hyp(__vgic_v3_init_lrs);
-}
-
-/**
- * kvm_vgic_hyp_init: populates the kvm_vgic_global_state variable
- * according to the host GIC model. It then calls either
- * vgic_v2/v3_probe, which registers the KVM_DEVICE that can be
- * instantiated by a guest later on.
- */
-int kvm_vgic_hyp_init(void)
-{
-       const struct gic_kvm_info *gic_kvm_info;
-       int ret;
-
-       gic_kvm_info = gic_get_kvm_info();
-       if (!gic_kvm_info)
-               return -ENODEV;
-
-       if (!gic_kvm_info->maint_irq) {
-               kvm_err("No vgic maintenance irq\n");
-               return -ENXIO;
-       }
-
-       switch (gic_kvm_info->type) {
-       case GIC_V2:
-               ret = vgic_v2_probe(gic_kvm_info);
-               break;
-       case GIC_V3:
-               ret = vgic_v3_probe(gic_kvm_info);
-               if (!ret) {
-                       static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif);
-                       kvm_info("GIC system register CPU interface enabled\n");
-               }
-               break;
-       default:
-               ret = -ENODEV;
-       }
-
-       if (ret)
-               return ret;
-
-       kvm_vgic_global_state.maint_irq = gic_kvm_info->maint_irq;
-       ret = request_percpu_irq(kvm_vgic_global_state.maint_irq,
-                                vgic_maintenance_handler,
-                                "vgic", kvm_get_running_vcpus());
-       if (ret) {
-               kvm_err("Cannot register interrupt %d\n",
-                       kvm_vgic_global_state.maint_irq);
-               return ret;
-       }
-
-       ret = cpuhp_setup_state(CPUHP_AP_KVM_ARM_VGIC_INIT_STARTING,
-                               "kvm/arm/vgic:starting",
-                               vgic_init_cpu_starting, vgic_init_cpu_dying);
-       if (ret) {
-               kvm_err("Cannot register vgic CPU notifier\n");
-               goto out_free_irq;
-       }
-
-       kvm_info("vgic interrupt IRQ%d\n", kvm_vgic_global_state.maint_irq);
-       return 0;
-
-out_free_irq:
-       free_percpu_irq(kvm_vgic_global_state.maint_irq,
-                       kvm_get_running_vcpus());
-       return ret;
-}
diff --git a/virt/kvm/arm/vgic/vgic-irqfd.c b/virt/kvm/arm/vgic/vgic-irqfd.c
deleted file mode 100644 (file)
index d8cdfea..0000000
+++ /dev/null
@@ -1,141 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2015, 2016 ARM Ltd.
- */
-
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <trace/events/kvm.h>
-#include <kvm/arm_vgic.h>
-#include "vgic.h"
-
-/**
- * vgic_irqfd_set_irq: inject the IRQ corresponding to the
- * irqchip routing entry
- *
- * This is the entry point for irqfd IRQ injection
- */
-static int vgic_irqfd_set_irq(struct kvm_kernel_irq_routing_entry *e,
-                       struct kvm *kvm, int irq_source_id,
-                       int level, bool line_status)
-{
-       unsigned int spi_id = e->irqchip.pin + VGIC_NR_PRIVATE_IRQS;
-
-       if (!vgic_valid_spi(kvm, spi_id))
-               return -EINVAL;
-       return kvm_vgic_inject_irq(kvm, 0, spi_id, level, NULL);
-}
-
-/**
- * kvm_set_routing_entry: populate a kvm routing entry
- * from a user routing entry
- *
- * @kvm: the VM this entry is applied to
- * @e: kvm kernel routing entry handle
- * @ue: user api routing entry handle
- * return 0 on success, -EINVAL on errors.
- */
-int kvm_set_routing_entry(struct kvm *kvm,
-                         struct kvm_kernel_irq_routing_entry *e,
-                         const struct kvm_irq_routing_entry *ue)
-{
-       int r = -EINVAL;
-
-       switch (ue->type) {
-       case KVM_IRQ_ROUTING_IRQCHIP:
-               e->set = vgic_irqfd_set_irq;
-               e->irqchip.irqchip = ue->u.irqchip.irqchip;
-               e->irqchip.pin = ue->u.irqchip.pin;
-               if ((e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS) ||
-                   (e->irqchip.irqchip >= KVM_NR_IRQCHIPS))
-                       goto out;
-               break;
-       case KVM_IRQ_ROUTING_MSI:
-               e->set = kvm_set_msi;
-               e->msi.address_lo = ue->u.msi.address_lo;
-               e->msi.address_hi = ue->u.msi.address_hi;
-               e->msi.data = ue->u.msi.data;
-               e->msi.flags = ue->flags;
-               e->msi.devid = ue->u.msi.devid;
-               break;
-       default:
-               goto out;
-       }
-       r = 0;
-out:
-       return r;
-}
-
-static void kvm_populate_msi(struct kvm_kernel_irq_routing_entry *e,
-                            struct kvm_msi *msi)
-{
-       msi->address_lo = e->msi.address_lo;
-       msi->address_hi = e->msi.address_hi;
-       msi->data = e->msi.data;
-       msi->flags = e->msi.flags;
-       msi->devid = e->msi.devid;
-}
-/**
- * kvm_set_msi: inject the MSI corresponding to the
- * MSI routing entry
- *
- * This is the entry point for irqfd MSI injection
- * and userspace MSI injection.
- */
-int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
-               struct kvm *kvm, int irq_source_id,
-               int level, bool line_status)
-{
-       struct kvm_msi msi;
-
-       if (!vgic_has_its(kvm))
-               return -ENODEV;
-
-       if (!level)
-               return -1;
-
-       kvm_populate_msi(e, &msi);
-       return vgic_its_inject_msi(kvm, &msi);
-}
-
-/**
- * kvm_arch_set_irq_inatomic: fast-path for irqfd injection
- *
- * Currently only direct MSI injection is supported.
- */
-int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
-                             struct kvm *kvm, int irq_source_id, int level,
-                             bool line_status)
-{
-       if (e->type == KVM_IRQ_ROUTING_MSI && vgic_has_its(kvm) && level) {
-               struct kvm_msi msi;
-
-               kvm_populate_msi(e, &msi);
-               if (!vgic_its_inject_cached_translation(kvm, &msi))
-                       return 0;
-       }
-
-       return -EWOULDBLOCK;
-}
-
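-/*
- * Descriptive comment (added for clarity): install a default 1:1
- * GSI-to-SPI routing table covering all SPIs, so that irqfd can be
- * used without userspace having to provide an explicit routing table.
- */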
-int kvm_vgic_setup_default_irq_routing(struct kvm *kvm)
-{
-       struct kvm_irq_routing_entry *entries;
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       u32 nr = dist->nr_spis;
-       int i, ret;
-
-       entries = kcalloc(nr, sizeof(*entries), GFP_KERNEL);
-       if (!entries)
-               return -ENOMEM;
-
-       for (i = 0; i < nr; i++) {
-               entries[i].gsi = i;
-               entries[i].type = KVM_IRQ_ROUTING_IRQCHIP;
-               entries[i].u.irqchip.irqchip = 0;
-               entries[i].u.irqchip.pin = i;
-       }
-       ret = kvm_set_irq_routing(kvm, entries, nr, 0);
-       kfree(entries);
-       return ret;
-}
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
deleted file mode 100644 (file)
index c012a52..0000000
+++ /dev/null
@@ -1,2783 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * GICv3 ITS emulation
- *
- * Copyright (C) 2015,2016 ARM Ltd.
- * Author: Andre Przywara <andre.przywara@arm.com>
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <linux/list.h>
-#include <linux/uaccess.h>
-#include <linux/list_sort.h>
-
-#include <linux/irqchip/arm-gic-v3.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-
-#include "vgic.h"
-#include "vgic-mmio.h"
-
-static int vgic_its_save_tables_v0(struct vgic_its *its);
-static int vgic_its_restore_tables_v0(struct vgic_its *its);
-static int vgic_its_commit_v0(struct vgic_its *its);
-static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
-                            struct kvm_vcpu *filter_vcpu, bool needs_inv);
-
-/*
- * Creates a new (reference to a) struct vgic_irq for a given LPI.
- * If this LPI is already mapped on another ITS, we increase its refcount
- * and return a pointer to the existing structure.
- * If this is a "new" LPI, we allocate and initialize a new struct vgic_irq.
- * This function returns a pointer to the _unlocked_ structure.
- */
-static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
-                                    struct kvm_vcpu *vcpu)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intid), *oldirq;
-       unsigned long flags;
-       int ret;
-
-       /* In this case there is no put, since we keep the reference. */
-       if (irq)
-               return irq;
-
-       irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL);
-       if (!irq)
-               return ERR_PTR(-ENOMEM);
-
-       INIT_LIST_HEAD(&irq->lpi_list);
-       INIT_LIST_HEAD(&irq->ap_list);
-       raw_spin_lock_init(&irq->irq_lock);
-
-       irq->config = VGIC_CONFIG_EDGE;
-       kref_init(&irq->refcount);
-       irq->intid = intid;
-       irq->target_vcpu = vcpu;
-       irq->group = 1;
-
-       raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
-
-       /*
-        * There could be a race with another vgic_add_lpi(), so we need to
-        * check that we don't add a second list entry with the same LPI.
-        */
-       list_for_each_entry(oldirq, &dist->lpi_list_head, lpi_list) {
-               if (oldirq->intid != intid)
-                       continue;
-
-               /* Someone was faster with adding this LPI, let's use that. */
-               kfree(irq);
-               irq = oldirq;
-
-               /*
-                * This increases the refcount, the caller is expected to
-                * call vgic_put_irq() on the returned pointer once it's
-                * finished with the IRQ.
-                */
-               vgic_get_irq_kref(irq);
-
-               goto out_unlock;
-       }
-
-       list_add_tail(&irq->lpi_list, &dist->lpi_list_head);
-       dist->lpi_list_count++;
-
-out_unlock:
-       raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
-
-       /*
-        * We "cache" the configuration table entries in our struct vgic_irq's.
-        * However we only have those structs for mapped IRQs, so we read in
-        * the respective config data from memory here upon mapping the LPI.
-        *
-        * Should any of these fail, behave as if we couldn't create the LPI
-        * by dropping the refcount and returning the error.
-        */
-       ret = update_lpi_config(kvm, irq, NULL, false);
-       if (ret) {
-               vgic_put_irq(kvm, irq);
-               return ERR_PTR(ret);
-       }
-
-       ret = vgic_v3_lpi_sync_pending_status(kvm, irq);
-       if (ret) {
-               vgic_put_irq(kvm, irq);
-               return ERR_PTR(ret);
-       }
-
-       return irq;
-}
-
-struct its_device {
-       struct list_head dev_list;
-
-       /* the head for the list of ITTEs */
-       struct list_head itt_head;
-       u32 num_eventid_bits;
-       gpa_t itt_addr;
-       u32 device_id;
-};
-
-#define COLLECTION_NOT_MAPPED ((u32)~0)
-
-struct its_collection {
-       struct list_head coll_list;
-
-       u32 collection_id;
-       u32 target_addr;
-};
-
-#define its_is_collection_mapped(coll) ((coll) && \
-                               ((coll)->target_addr != COLLECTION_NOT_MAPPED))
-
-struct its_ite {
-       struct list_head ite_list;
-
-       struct vgic_irq *irq;
-       struct its_collection *collection;
-       u32 event_id;
-};
-
-struct vgic_translation_cache_entry {
-       struct list_head        entry;
-       phys_addr_t             db;
-       u32                     devid;
-       u32                     eventid;
-       struct vgic_irq         *irq;
-};
-
-/**
- * struct vgic_its_abi - ITS abi ops and settings
- * @cte_esz: collection table entry size
- * @dte_esz: device table entry size
- * @ite_esz: interrupt translation table entry size
- * @save_tables: save the ITS tables into guest RAM
- * @restore_tables: restore the ITS internal structs from tables
- *  stored in guest RAM
- * @commit: initialize the registers which expose the ABI settings,
- *  especially the entry sizes
- */
-struct vgic_its_abi {
-       int cte_esz;
-       int dte_esz;
-       int ite_esz;
-       int (*save_tables)(struct vgic_its *its);
-       int (*restore_tables)(struct vgic_its *its);
-       int (*commit)(struct vgic_its *its);
-};
-
-#define ABI_0_ESZ      8
-#define ESZ_MAX                ABI_0_ESZ
-
-static const struct vgic_its_abi its_table_abi_versions[] = {
-       [0] = {
-        .cte_esz = ABI_0_ESZ,
-        .dte_esz = ABI_0_ESZ,
-        .ite_esz = ABI_0_ESZ,
-        .save_tables = vgic_its_save_tables_v0,
-        .restore_tables = vgic_its_restore_tables_v0,
-        .commit = vgic_its_commit_v0,
-       },
-};
-
-#define NR_ITS_ABIS    ARRAY_SIZE(its_table_abi_versions)
-
-inline const struct vgic_its_abi *vgic_its_get_abi(struct vgic_its *its)
-{
-       return &its_table_abi_versions[its->abi_rev];
-}
-
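-/*
- * Descriptive comment (added for clarity): switch the ITS to ABI
- * revision @rev and let the ABI's commit callback refresh the
- * registers that expose the table entry sizes.
- */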
-static int vgic_its_set_abi(struct vgic_its *its, u32 rev)
-{
-       const struct vgic_its_abi *abi;
-
-       its->abi_rev = rev;
-       abi = vgic_its_get_abi(its);
-       return abi->commit(its);
-}
-
-/*
- * Finds and returns a device in the device table for an ITS.
- * Must be called with the its_lock mutex held.
- */
-static struct its_device *find_its_device(struct vgic_its *its, u32 device_id)
-{
-       struct its_device *device;
-
-       list_for_each_entry(device, &its->device_list, dev_list)
-               if (device_id == device->device_id)
-                       return device;
-
-       return NULL;
-}
-
-/*
- * Finds and returns an interrupt translation table entry (ITTE) for a given
- * Device ID/Event ID pair on an ITS.
- * Must be called with the its_lock mutex held.
- */
-static struct its_ite *find_ite(struct vgic_its *its, u32 device_id,
-                                 u32 event_id)
-{
-       struct its_device *device;
-       struct its_ite *ite;
-
-       device = find_its_device(its, device_id);
-       if (device == NULL)
-               return NULL;
-
-       list_for_each_entry(ite, &device->itt_head, ite_list)
-               if (ite->event_id == event_id)
-                       return ite;
-
-       return NULL;
-}
-
-/* To be used as an iterator, this macro misses the enclosing parentheses */
-#define for_each_lpi_its(dev, ite, its) \
-       list_for_each_entry(dev, &(its)->device_list, dev_list) \
-               list_for_each_entry(ite, &(dev)->itt_head, ite_list)
-
-#define GIC_LPI_OFFSET 8192
-
-#define VITS_TYPER_IDBITS 16
-#define VITS_TYPER_DEVBITS 16
-#define VITS_DTE_MAX_DEVID_OFFSET      (BIT(14) - 1)
-#define VITS_ITE_MAX_EVENTID_OFFSET    (BIT(16) - 1)
-
-/*
- * Finds and returns a collection in the ITS collection table.
- * Must be called with the its_lock mutex held.
- */
-static struct its_collection *find_collection(struct vgic_its *its, int coll_id)
-{
-       struct its_collection *collection;
-
-       list_for_each_entry(collection, &its->collection_list, coll_list) {
-               if (coll_id == collection->collection_id)
-                       return collection;
-       }
-
-       return NULL;
-}
-
-#define LPI_PROP_ENABLE_BIT(p) ((p) & LPI_PROP_ENABLED)
-#define LPI_PROP_PRIORITY(p)   ((p) & 0xfc)
-
-/*
- * Reads the configuration data for a given LPI from guest memory and
- * updates the fields in struct vgic_irq.
- * If filter_vcpu is not NULL, applies only if the IRQ is targeting this
- * VCPU. Unconditionally applies if filter_vcpu is NULL.
- */
-static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
-                            struct kvm_vcpu *filter_vcpu, bool needs_inv)
-{
-       u64 propbase = GICR_PROPBASER_ADDRESS(kvm->arch.vgic.propbaser);
-       u8 prop;
-       int ret;
-       unsigned long flags;
-
-       ret = kvm_read_guest_lock(kvm, propbase + irq->intid - GIC_LPI_OFFSET,
-                                 &prop, 1);
-
-       if (ret)
-               return ret;
-
-       raw_spin_lock_irqsave(&irq->irq_lock, flags);
-
-       if (!filter_vcpu || filter_vcpu == irq->target_vcpu) {
-               irq->priority = LPI_PROP_PRIORITY(prop);
-               irq->enabled = LPI_PROP_ENABLE_BIT(prop);
-
-               if (!irq->hw) {
-                       vgic_queue_irq_unlock(kvm, irq, flags);
-                       return 0;
-               }
-       }
-
-       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-       if (irq->hw)
-               return its_prop_update_vlpi(irq->host_irq, prop, needs_inv);
-
-       return 0;
-}
-
-/*
- * Create a snapshot of the current LPIs targeting @vcpu, so that we can
- * enumerate those LPIs without holding any lock.
- * Returns their number and puts the kmalloc'ed array into intid_ptr.
- */
-int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       struct vgic_irq *irq;
-       unsigned long flags;
-       u32 *intids;
-       int irq_count, i = 0;
-
-       /*
-        * There is an obvious race between allocating the array and LPIs
-        * being mapped/unmapped. If we ended up here as a result of a
-        * command, we're safe (locks are held, preventing another
-        * command). If coming from another path (such as enabling LPIs),
-        * we must be careful not to overrun the array.
-        */
-       irq_count = READ_ONCE(dist->lpi_list_count);
-       intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL);
-       if (!intids)
-               return -ENOMEM;
-
-       raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
-       list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
-               if (i == irq_count)
-                       break;
-               /* We don't need to "get" the IRQ, as we hold the list lock. */
-               if (vcpu && irq->target_vcpu != vcpu)
-                       continue;
-               intids[i++] = irq->intid;
-       }
-       raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
-
-       *intid_ptr = intids;
-       return i;
-}
-
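-/*
- * Descriptive comment (added for clarity): retarget an LPI to @vcpu.
- * For a hardware-forwarded LPI, also move the vLPI mapping (and the
- * per-vPE vLPI count) over to the new vCPU.
- */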
-static int update_affinity(struct vgic_irq *irq, struct kvm_vcpu *vcpu)
-{
-       int ret = 0;
-       unsigned long flags;
-
-       raw_spin_lock_irqsave(&irq->irq_lock, flags);
-       irq->target_vcpu = vcpu;
-       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-       if (irq->hw) {
-               struct its_vlpi_map map;
-
-               ret = its_get_vlpi(irq->host_irq, &map);
-               if (ret)
-                       return ret;
-
-               if (map.vpe)
-                       atomic_dec(&map.vpe->vlpi_count);
-               map.vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
-               atomic_inc(&map.vpe->vlpi_count);
-
-               ret = its_map_vlpi(irq->host_irq, &map);
-       }
-
-       return ret;
-}
-
-/*
- * Promotes the ITS view of affinity of an ITTE (which redistributor this LPI
- * is targeting) to the VGIC's view, which deals with target VCPUs.
- * Needs to be called whenever either the collection for an LPI has
- * changed or the collection itself got retargeted.
- */
-static void update_affinity_ite(struct kvm *kvm, struct its_ite *ite)
-{
-       struct kvm_vcpu *vcpu;
-
-       if (!its_is_collection_mapped(ite->collection))
-               return;
-
-       vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr);
-       update_affinity(ite->irq, vcpu);
-}
-
-/*
- * Updates the target VCPU for every LPI targeting this collection.
- * Must be called with the its_lock mutex held.
- */
-static void update_affinity_collection(struct kvm *kvm, struct vgic_its *its,
-                                      struct its_collection *coll)
-{
-       struct its_device *device;
-       struct its_ite *ite;
-
-       for_each_lpi_its(device, ite, its) {
-               if (!ite->collection || coll != ite->collection)
-                       continue;
-
-               update_affinity_ite(kvm, ite);
-       }
-}
-
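-/*
- * Descriptive comment (added for clarity): number of LPI IDs the guest
- * can use, derived from PROPBASER.IDbits and capped at the ID width
- * supported by the ITS emulation.
- */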
-static u32 max_lpis_propbaser(u64 propbaser)
-{
-       int nr_idbits = (propbaser & 0x1f) + 1;
-
-       return 1U << min(nr_idbits, INTERRUPT_ID_BITS_ITS);
-}
-
-/*
- * Sync the pending table pending bit of LPIs targeting @vcpu
- * with our own data structures. This relies on the LPI being
- * mapped before.
- */
-static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
-{
-       gpa_t pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);
-       struct vgic_irq *irq;
-       int last_byte_offset = -1;
-       int ret = 0;
-       u32 *intids;
-       int nr_irqs, i;
-       unsigned long flags;
-       u8 pendmask;
-
-       nr_irqs = vgic_copy_lpi_list(vcpu->kvm, vcpu, &intids);
-       if (nr_irqs < 0)
-               return nr_irqs;
-
-       for (i = 0; i < nr_irqs; i++) {
-               int byte_offset, bit_nr;
-
-               byte_offset = intids[i] / BITS_PER_BYTE;
-               bit_nr = intids[i] % BITS_PER_BYTE;
-
-               /*
-                * For contiguously allocated LPIs chances are we just read
-                * this very same byte in the last iteration. Reuse that.
-                */
-               if (byte_offset != last_byte_offset) {
-                       ret = kvm_read_guest_lock(vcpu->kvm,
-                                                 pendbase + byte_offset,
-                                                 &pendmask, 1);
-                       if (ret) {
-                               kfree(intids);
-                               return ret;
-                       }
-                       last_byte_offset = byte_offset;
-               }
-
-               irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]);
-               raw_spin_lock_irqsave(&irq->irq_lock, flags);
-               irq->pending_latch = pendmask & (1U << bit_nr);
-               vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-
-       kfree(intids);
-
-       return ret;
-}
-
-static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm,
-                                             struct vgic_its *its,
-                                             gpa_t addr, unsigned int len)
-{
-       const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-       u64 reg = GITS_TYPER_PLPIS;
-
-       /*
-        * We use linear CPU numbers for redistributor addressing,
-        * so GITS_TYPER.PTA is 0.
-        * Also we force all PROPBASER registers to be the same, so
-        * CommonLPIAff is 0 as well.
-        * To avoid memory waste in the guest, we keep the number of IDBits and
-        * DevBits low - at least for the time being.
-        */
-       reg |= GIC_ENCODE_SZ(VITS_TYPER_DEVBITS, 5) << GITS_TYPER_DEVBITS_SHIFT;
-       reg |= GIC_ENCODE_SZ(VITS_TYPER_IDBITS, 5) << GITS_TYPER_IDBITS_SHIFT;
-       reg |= GIC_ENCODE_SZ(abi->ite_esz, 4) << GITS_TYPER_ITT_ENTRY_SIZE_SHIFT;
-
-       return extract_bytes(reg, addr & 7, len);
-}
-
-static unsigned long vgic_mmio_read_its_iidr(struct kvm *kvm,
-                                            struct vgic_its *its,
-                                            gpa_t addr, unsigned int len)
-{
-       u32 val;
-
-       val = (its->abi_rev << GITS_IIDR_REV_SHIFT) & GITS_IIDR_REV_MASK;
-       val |= (PRODUCT_ID_KVM << GITS_IIDR_PRODUCTID_SHIFT) | IMPLEMENTER_ARM;
-       return val;
-}
-
-static int vgic_mmio_uaccess_write_its_iidr(struct kvm *kvm,
-                                           struct vgic_its *its,
-                                           gpa_t addr, unsigned int len,
-                                           unsigned long val)
-{
-       u32 rev = GITS_IIDR_REV(val);
-
-       if (rev >= NR_ITS_ABIS)
-               return -EINVAL;
-       return vgic_its_set_abi(its, rev);
-}
-
-static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm,
-                                              struct vgic_its *its,
-                                              gpa_t addr, unsigned int len)
-{
-       switch (addr & 0xffff) {
-       case GITS_PIDR0:
-               return 0x92;    /* part number, bits[7:0] */
-       case GITS_PIDR1:
-               return 0xb4;    /* part number, bits[11:8] */
-       case GITS_PIDR2:
-               return GIC_PIDR2_ARCH_GICv3 | 0x0b;
-       case GITS_PIDR4:
-               return 0x40;    /* This is a 64K software visible page */
-       /* The following are the ID registers for (any) GIC. */
-       case GITS_CIDR0:
-               return 0x0d;
-       case GITS_CIDR1:
-               return 0xf0;
-       case GITS_CIDR2:
-               return 0x05;
-       case GITS_CIDR3:
-               return 0xb1;
-       }
-
-       return 0;
-}
-
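-/*
- * Descriptive comment (added for clarity): look up a cached
- * (doorbell, devid, eventid) translation. Must be called with the
- * lpi_list_lock held.
- */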
-static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist,
-                                              phys_addr_t db,
-                                              u32 devid, u32 eventid)
-{
-       struct vgic_translation_cache_entry *cte;
-
-       list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
-               /*
-                * If we hit a NULL entry, there is nothing after this
-                * point.
-                */
-               if (!cte->irq)
-                       break;
-
-               if (cte->db != db || cte->devid != devid ||
-                   cte->eventid != eventid)
-                       continue;
-
-               /*
-                * Move this entry to the head, as it is the most
-                * recently used.
-                */
-               if (!list_is_first(&cte->entry, &dist->lpi_translation_cache))
-                       list_move(&cte->entry, &dist->lpi_translation_cache);
-
-               return cte->irq;
-       }
-
-       return NULL;
-}
-
-static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db,
-                                            u32 devid, u32 eventid)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       struct vgic_irq *irq;
-       unsigned long flags;
-
-       raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
-       irq = __vgic_its_check_cache(dist, db, devid, eventid);
-       raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
-
-       return irq;
-}
-
-static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
-                                      u32 devid, u32 eventid,
-                                      struct vgic_irq *irq)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       struct vgic_translation_cache_entry *cte;
-       unsigned long flags;
-       phys_addr_t db;
-
-       /* Do not cache a directly injected interrupt */
-       if (irq->hw)
-               return;
-
-       raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
-
-       if (unlikely(list_empty(&dist->lpi_translation_cache)))
-               goto out;
-
-       /*
-        * We could have raced with another CPU caching the same
-        * translation behind our back, so let's check that it is not
-        * already in the cache.
-        */
-       db = its->vgic_its_base + GITS_TRANSLATER;
-       if (__vgic_its_check_cache(dist, db, devid, eventid))
-               goto out;
-
-       /* Always reuse the last entry (LRU policy) */
-       cte = list_last_entry(&dist->lpi_translation_cache,
-                             typeof(*cte), entry);
-
-       /*
-        * Caching the translation implies having an extra reference
-        * to the interrupt, so drop the potential reference on what
-        * was in the cache, and increment it on the new interrupt.
-        */
-       if (cte->irq)
-               __vgic_put_lpi_locked(kvm, cte->irq);
-
-       vgic_get_irq_kref(irq);
-
-       cte->db         = db;
-       cte->devid      = devid;
-       cte->eventid    = eventid;
-       cte->irq        = irq;
-
-       /* Move the new translation to the head of the list */
-       list_move(&cte->entry, &dist->lpi_translation_cache);
-
-out:
-       raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
-}
-
-void vgic_its_invalidate_cache(struct kvm *kvm)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       struct vgic_translation_cache_entry *cte;
-       unsigned long flags;
-
-       raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
-
-       list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
-               /*
-                * If we hit a NULL entry, there is nothing after this
-                * point.
-                */
-               if (!cte->irq)
-                       break;
-
-               __vgic_put_lpi_locked(kvm, cte->irq);
-               cte->irq = NULL;
-       }
-
-       raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
-}
-
-int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
-                        u32 devid, u32 eventid, struct vgic_irq **irq)
-{
-       struct kvm_vcpu *vcpu;
-       struct its_ite *ite;
-
-       if (!its->enabled)
-               return -EBUSY;
-
-       ite = find_ite(its, devid, eventid);
-       if (!ite || !its_is_collection_mapped(ite->collection))
-               return E_ITS_INT_UNMAPPED_INTERRUPT;
-
-       vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr);
-       if (!vcpu)
-               return E_ITS_INT_UNMAPPED_INTERRUPT;
-
-       if (!vcpu->arch.vgic_cpu.lpis_enabled)
-               return -EBUSY;
-
-       vgic_its_cache_translation(kvm, its, devid, eventid, ite->irq);
-
-       *irq = ite->irq;
-       return 0;
-}
-
-struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi)
-{
-       u64 address;
-       struct kvm_io_device *kvm_io_dev;
-       struct vgic_io_device *iodev;
-
-       if (!vgic_has_its(kvm))
-               return ERR_PTR(-ENODEV);
-
-       if (!(msi->flags & KVM_MSI_VALID_DEVID))
-               return ERR_PTR(-EINVAL);
-
-       address = (u64)msi->address_hi << 32 | msi->address_lo;
-
-       kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, address);
-       if (!kvm_io_dev)
-               return ERR_PTR(-EINVAL);
-
-       if (kvm_io_dev->ops != &kvm_io_gic_ops)
-               return ERR_PTR(-EINVAL);
-
-       iodev = container_of(kvm_io_dev, struct vgic_io_device, dev);
-       if (iodev->iodev_type != IODEV_ITS)
-               return ERR_PTR(-EINVAL);
-
-       return iodev->its;
-}
-
-/*
- * Find the target VCPU and the LPI number for a given devid/eventid pair
- * and make this IRQ pending, possibly injecting it.
- * Must be called with the its_lock mutex held.
- * Returns 0 on success, a positive error value for any ITS mapping
- * related errors and negative error values for generic errors.
- */
-static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its,
-                               u32 devid, u32 eventid)
-{
-       struct vgic_irq *irq = NULL;
-       unsigned long flags;
-       int err;
-
-       err = vgic_its_resolve_lpi(kvm, its, devid, eventid, &irq);
-       if (err)
-               return err;
-
-       if (irq->hw)
-               return irq_set_irqchip_state(irq->host_irq,
-                                            IRQCHIP_STATE_PENDING, true);
-
-       raw_spin_lock_irqsave(&irq->irq_lock, flags);
-       irq->pending_latch = true;
-       vgic_queue_irq_unlock(kvm, irq, flags);
-
-       return 0;
-}
-
-int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi)
-{
-       struct vgic_irq *irq;
-       unsigned long flags;
-       phys_addr_t db;
-
-       db = (u64)msi->address_hi << 32 | msi->address_lo;
-       irq = vgic_its_check_cache(kvm, db, msi->devid, msi->data);
-
-       if (!irq)
-               return -1;
-
-       raw_spin_lock_irqsave(&irq->irq_lock, flags);
-       irq->pending_latch = true;
-       vgic_queue_irq_unlock(kvm, irq, flags);
-
-       return 0;
-}
-
-/*
- * Queries the KVM IO bus framework to get the ITS pointer from the given
- * doorbell address.
- * We then call vgic_its_trigger_msi() with the decoded data.
- * According to the KVM_SIGNAL_MSI API description, this returns 1 on success.
- */
-int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi)
-{
-       struct vgic_its *its;
-       int ret;
-
-       if (!vgic_its_inject_cached_translation(kvm, msi))
-               return 1;
-
-       its = vgic_msi_to_its(kvm, msi);
-       if (IS_ERR(its))
-               return PTR_ERR(its);
-
-       mutex_lock(&its->its_lock);
-       ret = vgic_its_trigger_msi(kvm, its, msi->devid, msi->data);
-       mutex_unlock(&its->its_lock);
-
-       if (ret < 0)
-               return ret;
-
-       /*
-        * KVM_SIGNAL_MSI demands a return value > 0 for success and 0
-        * if the guest has blocked the MSI. So we map any LPI mapping
-        * related error to that.
-        */
-       if (ret)
-               return 0;
-       else
-               return 1;
-}
-
-/* Requires the its_lock to be held. */
-static void its_free_ite(struct kvm *kvm, struct its_ite *ite)
-{
-       list_del(&ite->ite_list);
-
-       /* This put matches the get in vgic_add_lpi. */
-       if (ite->irq) {
-               if (ite->irq->hw)
-                       WARN_ON(its_unmap_vlpi(ite->irq->host_irq));
-
-               vgic_put_irq(kvm, ite->irq);
-       }
-
-       kfree(ite);
-}
-
-static u64 its_cmd_mask_field(u64 *its_cmd, int word, int shift, int size)
-{
-       return (le64_to_cpu(its_cmd[word]) >> shift) & (BIT_ULL(size) - 1);
-}
-
-#define its_cmd_get_command(cmd)       its_cmd_mask_field(cmd, 0,  0,  8)
-#define its_cmd_get_deviceid(cmd)      its_cmd_mask_field(cmd, 0, 32, 32)
-#define its_cmd_get_size(cmd)          (its_cmd_mask_field(cmd, 1,  0,  5) + 1)
-#define its_cmd_get_id(cmd)            its_cmd_mask_field(cmd, 1,  0, 32)
-#define its_cmd_get_physical_id(cmd)   its_cmd_mask_field(cmd, 1, 32, 32)
-#define its_cmd_get_collection(cmd)    its_cmd_mask_field(cmd, 2,  0, 16)
-#define its_cmd_get_ittaddr(cmd)       (its_cmd_mask_field(cmd, 2,  8, 44) << 8)
-#define its_cmd_get_target_addr(cmd)   its_cmd_mask_field(cmd, 2, 16, 32)
-#define its_cmd_get_validbit(cmd)      its_cmd_mask_field(cmd, 2, 63,  1)
-
-/*
- * The DISCARD command frees an Interrupt Translation Table Entry (ITTE).
- * Must be called with the its_lock mutex held.
- */
-static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its,
-                                      u64 *its_cmd)
-{
-       u32 device_id = its_cmd_get_deviceid(its_cmd);
-       u32 event_id = its_cmd_get_id(its_cmd);
-       struct its_ite *ite;
-
-       ite = find_ite(its, device_id, event_id);
-       if (ite && its_is_collection_mapped(ite->collection)) {
-               /*
-                * Though the spec talks about removing the pending state, we
-                * don't bother here since we clear the ITTE anyway and the
-                * pending state is a property of the ITTE struct.
-                */
-               vgic_its_invalidate_cache(kvm);
-
-               its_free_ite(kvm, ite);
-               return 0;
-       }
-
-       return E_ITS_DISCARD_UNMAPPED_INTERRUPT;
-}
-
-/*
- * The MOVI command moves an ITTE to a different collection.
- * Must be called with the its_lock mutex held.
- */
-static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its,
-                                   u64 *its_cmd)
-{
-       u32 device_id = its_cmd_get_deviceid(its_cmd);
-       u32 event_id = its_cmd_get_id(its_cmd);
-       u32 coll_id = its_cmd_get_collection(its_cmd);
-       struct kvm_vcpu *vcpu;
-       struct its_ite *ite;
-       struct its_collection *collection;
-
-       ite = find_ite(its, device_id, event_id);
-       if (!ite)
-               return E_ITS_MOVI_UNMAPPED_INTERRUPT;
-
-       if (!its_is_collection_mapped(ite->collection))
-               return E_ITS_MOVI_UNMAPPED_COLLECTION;
-
-       collection = find_collection(its, coll_id);
-       if (!its_is_collection_mapped(collection))
-               return E_ITS_MOVI_UNMAPPED_COLLECTION;
-
-       ite->collection = collection;
-       vcpu = kvm_get_vcpu(kvm, collection->target_addr);
-
-       vgic_its_invalidate_cache(kvm);
-
-       return update_affinity(ite->irq, vcpu);
-}
-
-/*
- * Check whether an ID can be stored into the corresponding guest table.
- * For a direct table this is pretty easy, but gets a bit nasty for
- * indirect tables. We check whether the resulting guest physical address
- * is actually valid (covered by a memslot and guest accessible).
- * For this we have to read the respective first level entry.
- */
-static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id,
-                             gpa_t *eaddr)
-{
-       int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
-       u64 indirect_ptr, type = GITS_BASER_TYPE(baser);
-       phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser);
-       int esz = GITS_BASER_ENTRY_SIZE(baser);
-       int index, idx;
-       gfn_t gfn;
-       bool ret;
-
-       switch (type) {
-       case GITS_BASER_TYPE_DEVICE:
-               if (id >= BIT_ULL(VITS_TYPER_DEVBITS))
-                       return false;
-               break;
-       case GITS_BASER_TYPE_COLLECTION:
-               /* as GITS_TYPER.CIL == 0, ITS supports 16-bit collection ID */
-               if (id >= BIT_ULL(16))
-                       return false;
-               break;
-       default:
-               return false;
-       }
-
-       if (!(baser & GITS_BASER_INDIRECT)) {
-               phys_addr_t addr;
-
-               if (id >= (l1_tbl_size / esz))
-                       return false;
-
-               addr = base + id * esz;
-               gfn = addr >> PAGE_SHIFT;
-
-               if (eaddr)
-                       *eaddr = addr;
-
-               goto out;
-       }
-
-       /* calculate and check the index into the 1st level */
-       index = id / (SZ_64K / esz);
-       if (index >= (l1_tbl_size / sizeof(u64)))
-               return false;
-
-       /* Each 1st level entry is represented by a 64-bit value. */
-       if (kvm_read_guest_lock(its->dev->kvm,
-                          base + index * sizeof(indirect_ptr),
-                          &indirect_ptr, sizeof(indirect_ptr)))
-               return false;
-
-       indirect_ptr = le64_to_cpu(indirect_ptr);
-
-       /* check the valid bit of the first level entry */
-       if (!(indirect_ptr & BIT_ULL(63)))
-               return false;
-
-       /* Mask the guest physical address and calculate the frame number. */
-       indirect_ptr &= GENMASK_ULL(51, 16);
-
-       /* Find the address of the actual entry */
-       index = id % (SZ_64K / esz);
-       indirect_ptr += index * esz;
-       gfn = indirect_ptr >> PAGE_SHIFT;
-
-       if (eaddr)
-               *eaddr = indirect_ptr;
-
-out:
-       idx = srcu_read_lock(&its->dev->kvm->srcu);
-       ret = kvm_is_visible_gfn(its->dev->kvm, gfn);
-       srcu_read_unlock(&its->dev->kvm->srcu, idx);
-       return ret;
-}
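-
-/*
- * A concrete example of the indirect case above: with 8-byte entries, each
- * 64K L2 page covers SZ_64K / 8 = 8192 IDs, so ID 20000 selects L1 index
- * 20000 / 8192 = 2 and lands 20000 % 8192 = 3616 entries (28928 bytes) into
- * the L2 page that the L1 entry points to.
- */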
-
-static int vgic_its_alloc_collection(struct vgic_its *its,
-                                    struct its_collection **colp,
-                                    u32 coll_id)
-{
-       struct its_collection *collection;
-
-       if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL))
-               return E_ITS_MAPC_COLLECTION_OOR;
-
-       collection = kzalloc(sizeof(*collection), GFP_KERNEL);
-       if (!collection)
-               return -ENOMEM;
-
-       collection->collection_id = coll_id;
-       collection->target_addr = COLLECTION_NOT_MAPPED;
-
-       list_add_tail(&collection->coll_list, &its->collection_list);
-       *colp = collection;
-
-       return 0;
-}
-
-static void vgic_its_free_collection(struct vgic_its *its, u32 coll_id)
-{
-       struct its_collection *collection;
-       struct its_device *device;
-       struct its_ite *ite;
-
-       /*
-        * Clearing the mapping for that collection ID removes the
-        * entry from the list. If there wasn't any before, we can
-        * go home early.
-        */
-       collection = find_collection(its, coll_id);
-       if (!collection)
-               return;
-
-       for_each_lpi_its(device, ite, its)
-               if (ite->collection &&
-                   ite->collection->collection_id == coll_id)
-                       ite->collection = NULL;
-
-       list_del(&collection->coll_list);
-       kfree(collection);
-}
-
-/* Must be called with its_lock mutex held */
-static struct its_ite *vgic_its_alloc_ite(struct its_device *device,
-                                         struct its_collection *collection,
-                                         u32 event_id)
-{
-       struct its_ite *ite;
-
-       ite = kzalloc(sizeof(*ite), GFP_KERNEL);
-       if (!ite)
-               return ERR_PTR(-ENOMEM);
-
-       ite->event_id   = event_id;
-       ite->collection = collection;
-
-       list_add_tail(&ite->ite_list, &device->itt_head);
-       return ite;
-}
-
-/*
- * The MAPTI and MAPI commands map LPIs to ITTEs.
- * Must be called with its_lock mutex held.
- */
-static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
-                                   u64 *its_cmd)
-{
-       u32 device_id = its_cmd_get_deviceid(its_cmd);
-       u32 event_id = its_cmd_get_id(its_cmd);
-       u32 coll_id = its_cmd_get_collection(its_cmd);
-       struct its_ite *ite;
-       struct kvm_vcpu *vcpu = NULL;
-       struct its_device *device;
-       struct its_collection *collection, *new_coll = NULL;
-       struct vgic_irq *irq;
-       int lpi_nr;
-
-       device = find_its_device(its, device_id);
-       if (!device)
-               return E_ITS_MAPTI_UNMAPPED_DEVICE;
-
-       if (event_id >= BIT_ULL(device->num_eventid_bits))
-               return E_ITS_MAPTI_ID_OOR;
-
-       if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI)
-               lpi_nr = its_cmd_get_physical_id(its_cmd);
-       else
-               lpi_nr = event_id;
-       if (lpi_nr < GIC_LPI_OFFSET ||
-           lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser))
-               return E_ITS_MAPTI_PHYSICALID_OOR;
-
-       /* If there is an existing mapping, behavior is UNPREDICTABLE. */
-       if (find_ite(its, device_id, event_id))
-               return 0;
-
-       collection = find_collection(its, coll_id);
-       if (!collection) {
-               int ret = vgic_its_alloc_collection(its, &collection, coll_id);
-               if (ret)
-                       return ret;
-               new_coll = collection;
-       }
-
-       ite = vgic_its_alloc_ite(device, collection, event_id);
-       if (IS_ERR(ite)) {
-               if (new_coll)
-                       vgic_its_free_collection(its, coll_id);
-               return PTR_ERR(ite);
-       }
-
-       if (its_is_collection_mapped(collection))
-               vcpu = kvm_get_vcpu(kvm, collection->target_addr);
-
-       irq = vgic_add_lpi(kvm, lpi_nr, vcpu);
-       if (IS_ERR(irq)) {
-               if (new_coll)
-                       vgic_its_free_collection(its, coll_id);
-               its_free_ite(kvm, ite);
-               return PTR_ERR(irq);
-       }
-       ite->irq = irq;
-
-       return 0;
-}
-
-/* Requires the its_lock to be held. */
-static void vgic_its_free_device(struct kvm *kvm, struct its_device *device)
-{
-       struct its_ite *ite, *temp;
-
-       /*
-        * The spec says that unmapping a device that still has valid
-        * ITTEs associated with it is UNPREDICTABLE. We remove all ITTEs,
-        * since we cannot leave the memory unreferenced.
-        */
-       list_for_each_entry_safe(ite, temp, &device->itt_head, ite_list)
-               its_free_ite(kvm, ite);
-
-       vgic_its_invalidate_cache(kvm);
-
-       list_del(&device->dev_list);
-       kfree(device);
-}
-
-/* its lock must be held */
-static void vgic_its_free_device_list(struct kvm *kvm, struct vgic_its *its)
-{
-       struct its_device *cur, *temp;
-
-       list_for_each_entry_safe(cur, temp, &its->device_list, dev_list)
-               vgic_its_free_device(kvm, cur);
-}
-
-/* its lock must be held */
-static void vgic_its_free_collection_list(struct kvm *kvm, struct vgic_its *its)
-{
-       struct its_collection *cur, *temp;
-
-       list_for_each_entry_safe(cur, temp, &its->collection_list, coll_list)
-               vgic_its_free_collection(its, cur->collection_id);
-}
-
-/* Must be called with its_lock mutex held */
-static struct its_device *vgic_its_alloc_device(struct vgic_its *its,
-                                               u32 device_id, gpa_t itt_addr,
-                                               u8 num_eventid_bits)
-{
-       struct its_device *device;
-
-       device = kzalloc(sizeof(*device), GFP_KERNEL);
-       if (!device)
-               return ERR_PTR(-ENOMEM);
-
-       device->device_id = device_id;
-       device->itt_addr = itt_addr;
-       device->num_eventid_bits = num_eventid_bits;
-       INIT_LIST_HEAD(&device->itt_head);
-
-       list_add_tail(&device->dev_list, &its->device_list);
-       return device;
-}
-
-/*
- * MAPD maps or unmaps a device ID to Interrupt Translation Tables (ITTs).
- * Must be called with the its_lock mutex held.
- */
-static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its,
-                                   u64 *its_cmd)
-{
-       u32 device_id = its_cmd_get_deviceid(its_cmd);
-       bool valid = its_cmd_get_validbit(its_cmd);
-       u8 num_eventid_bits = its_cmd_get_size(its_cmd);
-       gpa_t itt_addr = its_cmd_get_ittaddr(its_cmd);
-       struct its_device *device;
-
-       if (!vgic_its_check_id(its, its->baser_device_table, device_id, NULL))
-               return E_ITS_MAPD_DEVICE_OOR;
-
-       if (valid && num_eventid_bits > VITS_TYPER_IDBITS)
-               return E_ITS_MAPD_ITTSIZE_OOR;
-
-       device = find_its_device(its, device_id);
-
-       /*
-        * The spec says that calling MAPD on an already mapped device
-        * invalidates all cached data for this device. We implement this
-        * by removing the mapping and re-establishing it.
-        */
-       if (device)
-               vgic_its_free_device(kvm, device);
-
-       /*
-        * The spec does not say whether unmapping a not-mapped device
-        * is an error, so we are done in any case.
-        */
-       if (!valid)
-               return 0;
-
-       device = vgic_its_alloc_device(its, device_id, itt_addr,
-                                      num_eventid_bits);
-
-       return PTR_ERR_OR_ZERO(device);
-}
-
-/*
- * The MAPC command maps collection IDs to redistributors.
- * Must be called with the its_lock mutex held.
- */
-static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its,
-                                   u64 *its_cmd)
-{
-       u16 coll_id;
-       u32 target_addr;
-       struct its_collection *collection;
-       bool valid;
-
-       valid = its_cmd_get_validbit(its_cmd);
-       coll_id = its_cmd_get_collection(its_cmd);
-       target_addr = its_cmd_get_target_addr(its_cmd);
-
-       if (target_addr >= atomic_read(&kvm->online_vcpus))
-               return E_ITS_MAPC_PROCNUM_OOR;
-
-       if (!valid) {
-               vgic_its_free_collection(its, coll_id);
-               vgic_its_invalidate_cache(kvm);
-       } else {
-               collection = find_collection(its, coll_id);
-
-               if (!collection) {
-                       int ret;
-
-                       ret = vgic_its_alloc_collection(its, &collection,
-                                                       coll_id);
-                       if (ret)
-                               return ret;
-                       collection->target_addr = target_addr;
-               } else {
-                       collection->target_addr = target_addr;
-                       update_affinity_collection(kvm, its, collection);
-               }
-       }
-
-       return 0;
-}
-
-/*
- * The CLEAR command removes the pending state for a particular LPI.
- * Must be called with the its_lock mutex held.
- */
-static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its,
-                                    u64 *its_cmd)
-{
-       u32 device_id = its_cmd_get_deviceid(its_cmd);
-       u32 event_id = its_cmd_get_id(its_cmd);
-       struct its_ite *ite;
-
-       ite = find_ite(its, device_id, event_id);
-       if (!ite)
-               return E_ITS_CLEAR_UNMAPPED_INTERRUPT;
-
-       ite->irq->pending_latch = false;
-
-       if (ite->irq->hw)
-               return irq_set_irqchip_state(ite->irq->host_irq,
-                                            IRQCHIP_STATE_PENDING, false);
-
-       return 0;
-}
-
-/*
- * The INV command syncs the configuration bits from the memory table.
- * Must be called with the its_lock mutex held.
- */
-static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its,
-                                  u64 *its_cmd)
-{
-       u32 device_id = its_cmd_get_deviceid(its_cmd);
-       u32 event_id = its_cmd_get_id(its_cmd);
-       struct its_ite *ite;
-
-       ite = find_ite(its, device_id, event_id);
-       if (!ite)
-               return E_ITS_INV_UNMAPPED_INTERRUPT;
-
-       return update_lpi_config(kvm, ite->irq, NULL, true);
-}
-
-/*
- * The INVALL command requests flushing of all IRQ data in this collection.
- * Find the VCPU mapped to that collection, then iterate over the VM's list
- * of mapped LPIs and update the configuration for each IRQ which targets
- * the specified vcpu. The configuration will be read from the in-memory
- * configuration table.
- * Must be called with the its_lock mutex held.
- */
-static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its,
-                                     u64 *its_cmd)
-{
-       u32 coll_id = its_cmd_get_collection(its_cmd);
-       struct its_collection *collection;
-       struct kvm_vcpu *vcpu;
-       struct vgic_irq *irq;
-       u32 *intids;
-       int irq_count, i;
-
-       collection = find_collection(its, coll_id);
-       if (!its_is_collection_mapped(collection))
-               return E_ITS_INVALL_UNMAPPED_COLLECTION;
-
-       vcpu = kvm_get_vcpu(kvm, collection->target_addr);
-
-       irq_count = vgic_copy_lpi_list(kvm, vcpu, &intids);
-       if (irq_count < 0)
-               return irq_count;
-
-       for (i = 0; i < irq_count; i++) {
-               irq = vgic_get_irq(kvm, NULL, intids[i]);
-               if (!irq)
-                       continue;
-               update_lpi_config(kvm, irq, vcpu, false);
-               vgic_put_irq(kvm, irq);
-       }
-
-       kfree(intids);
-
-       if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.its_vm)
-               its_invall_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe);
-
-       return 0;
-}
-
-/*
- * The MOVALL command moves the pending state of all IRQs targeting one
- * redistributor to another. We don't hold the pending state in the VCPUs,
- * but in the IRQs instead, so there is really not much to do for us here.
- * However the spec says that no IRQ must target the old redistributor
- * afterwards, so we make sure that no LPI is using the associated target_vcpu.
- * This command affects all LPIs in the system that target that redistributor.
- */
-static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its,
-                                     u64 *its_cmd)
-{
-       u32 target1_addr = its_cmd_get_target_addr(its_cmd);
-       u32 target2_addr = its_cmd_mask_field(its_cmd, 3, 16, 32);
-       struct kvm_vcpu *vcpu1, *vcpu2;
-       struct vgic_irq *irq;
-       u32 *intids;
-       int irq_count, i;
-
-       if (target1_addr >= atomic_read(&kvm->online_vcpus) ||
-           target2_addr >= atomic_read(&kvm->online_vcpus))
-               return E_ITS_MOVALL_PROCNUM_OOR;
-
-       if (target1_addr == target2_addr)
-               return 0;
-
-       vcpu1 = kvm_get_vcpu(kvm, target1_addr);
-       vcpu2 = kvm_get_vcpu(kvm, target2_addr);
-
-       irq_count = vgic_copy_lpi_list(kvm, vcpu1, &intids);
-       if (irq_count < 0)
-               return irq_count;
-
-       for (i = 0; i < irq_count; i++) {
-               irq = vgic_get_irq(kvm, NULL, intids[i]);
-
-               update_affinity(irq, vcpu2);
-
-               vgic_put_irq(kvm, irq);
-       }
-
-       vgic_its_invalidate_cache(kvm);
-
-       kfree(intids);
-       return 0;
-}
-
-/*
- * The INT command injects the LPI associated with that DevID/EvID pair.
- * Must be called with the its_lock mutex held.
- */
-static int vgic_its_cmd_handle_int(struct kvm *kvm, struct vgic_its *its,
-                                  u64 *its_cmd)
-{
-       u32 msi_data = its_cmd_get_id(its_cmd);
-       u64 msi_devid = its_cmd_get_deviceid(its_cmd);
-
-       return vgic_its_trigger_msi(kvm, its, msi_devid, msi_data);
-}
-
-/*
- * This function is called with the its_cmd lock held, but the ITS data
- * structure lock dropped.
- */
-static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its,
-                                  u64 *its_cmd)
-{
-       int ret = -ENODEV;
-
-       mutex_lock(&its->its_lock);
-       switch (its_cmd_get_command(its_cmd)) {
-       case GITS_CMD_MAPD:
-               ret = vgic_its_cmd_handle_mapd(kvm, its, its_cmd);
-               break;
-       case GITS_CMD_MAPC:
-               ret = vgic_its_cmd_handle_mapc(kvm, its, its_cmd);
-               break;
-       case GITS_CMD_MAPI:
-               ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd);
-               break;
-       case GITS_CMD_MAPTI:
-               ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd);
-               break;
-       case GITS_CMD_MOVI:
-               ret = vgic_its_cmd_handle_movi(kvm, its, its_cmd);
-               break;
-       case GITS_CMD_DISCARD:
-               ret = vgic_its_cmd_handle_discard(kvm, its, its_cmd);
-               break;
-       case GITS_CMD_CLEAR:
-               ret = vgic_its_cmd_handle_clear(kvm, its, its_cmd);
-               break;
-       case GITS_CMD_MOVALL:
-               ret = vgic_its_cmd_handle_movall(kvm, its, its_cmd);
-               break;
-       case GITS_CMD_INT:
-               ret = vgic_its_cmd_handle_int(kvm, its, its_cmd);
-               break;
-       case GITS_CMD_INV:
-               ret = vgic_its_cmd_handle_inv(kvm, its, its_cmd);
-               break;
-       case GITS_CMD_INVALL:
-               ret = vgic_its_cmd_handle_invall(kvm, its, its_cmd);
-               break;
-       case GITS_CMD_SYNC:
-               /* we ignore this command: we are in sync all of the time */
-               ret = 0;
-               break;
-       }
-       mutex_unlock(&its->its_lock);
-
-       return ret;
-}
-
-static u64 vgic_sanitise_its_baser(u64 reg)
-{
-       reg = vgic_sanitise_field(reg, GITS_BASER_SHAREABILITY_MASK,
-                                 GITS_BASER_SHAREABILITY_SHIFT,
-                                 vgic_sanitise_shareability);
-       reg = vgic_sanitise_field(reg, GITS_BASER_INNER_CACHEABILITY_MASK,
-                                 GITS_BASER_INNER_CACHEABILITY_SHIFT,
-                                 vgic_sanitise_inner_cacheability);
-       reg = vgic_sanitise_field(reg, GITS_BASER_OUTER_CACHEABILITY_MASK,
-                                 GITS_BASER_OUTER_CACHEABILITY_SHIFT,
-                                 vgic_sanitise_outer_cacheability);
-
-       /* We support only one (ITS) page size: 64K */
-       reg = (reg & ~GITS_BASER_PAGE_SIZE_MASK) | GITS_BASER_PAGE_SIZE_64K;
-
-       return reg;
-}
-
-static u64 vgic_sanitise_its_cbaser(u64 reg)
-{
-       reg = vgic_sanitise_field(reg, GITS_CBASER_SHAREABILITY_MASK,
-                                 GITS_CBASER_SHAREABILITY_SHIFT,
-                                 vgic_sanitise_shareability);
-       reg = vgic_sanitise_field(reg, GITS_CBASER_INNER_CACHEABILITY_MASK,
-                                 GITS_CBASER_INNER_CACHEABILITY_SHIFT,
-                                 vgic_sanitise_inner_cacheability);
-       reg = vgic_sanitise_field(reg, GITS_CBASER_OUTER_CACHEABILITY_MASK,
-                                 GITS_CBASER_OUTER_CACHEABILITY_SHIFT,
-                                 vgic_sanitise_outer_cacheability);
-
-       /* Sanitise the physical address to be 64k aligned. */
-       reg &= ~GENMASK_ULL(15, 12);
-
-       return reg;
-}
-
-static unsigned long vgic_mmio_read_its_cbaser(struct kvm *kvm,
-                                              struct vgic_its *its,
-                                              gpa_t addr, unsigned int len)
-{
-       return extract_bytes(its->cbaser, addr & 7, len);
-}
-
-static void vgic_mmio_write_its_cbaser(struct kvm *kvm, struct vgic_its *its,
-                                      gpa_t addr, unsigned int len,
-                                      unsigned long val)
-{
-       /* When GITS_CTLR.Enable is 1, this register is RO. */
-       if (its->enabled)
-               return;
-
-       mutex_lock(&its->cmd_lock);
-       its->cbaser = update_64bit_reg(its->cbaser, addr & 7, len, val);
-       its->cbaser = vgic_sanitise_its_cbaser(its->cbaser);
-       its->creadr = 0;
-       /*
-        * CWRITER is architecturally UNKNOWN on reset, but we need to reset
-        * it to CREADR to make sure we start with an empty command buffer.
-        */
-       its->cwriter = its->creadr;
-       mutex_unlock(&its->cmd_lock);
-}
-
-#define ITS_CMD_BUFFER_SIZE(baser)     ((((baser) & 0xff) + 1) << 12)
-#define ITS_CMD_SIZE                   32
-#define ITS_CMD_OFFSET(reg)            ((reg) & GENMASK(19, 5))
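-
-/*
- * GITS_CBASER[7:0] encodes the number of 4K pages minus one, so for example
- * a Size field of 0 gives ITS_CMD_BUFFER_SIZE() = 4096 bytes, i.e. room for
- * 4096 / ITS_CMD_SIZE = 128 commands. ITS_CMD_OFFSET() strips everything but
- * the 32-byte aligned offset bits [19:5].
- */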
-
-/* Must be called with the cmd_lock held. */
-static void vgic_its_process_commands(struct kvm *kvm, struct vgic_its *its)
-{
-       gpa_t cbaser;
-       u64 cmd_buf[4];
-
-       /* Commands are only processed when the ITS is enabled. */
-       if (!its->enabled)
-               return;
-
-       cbaser = GITS_CBASER_ADDRESS(its->cbaser);
-
-       while (its->cwriter != its->creadr) {
-               int ret = kvm_read_guest_lock(kvm, cbaser + its->creadr,
-                                             cmd_buf, ITS_CMD_SIZE);
-               /*
-                * If kvm_read_guest_lock() fails, this could be due to the guest
-                * programming a bogus value in CBASER or something else going
-                * wrong from which we cannot easily recover.
-                * According to section 6.3.2 in the GICv3 spec we can just
-                * ignore that command then.
-                */
-               if (!ret)
-                       vgic_its_handle_command(kvm, its, cmd_buf);
-
-               its->creadr += ITS_CMD_SIZE;
-               if (its->creadr == ITS_CMD_BUFFER_SIZE(its->cbaser))
-                       its->creadr = 0;
-       }
-}
-
-/*
- * By writing to CWRITER the guest announces new commands to be processed.
- * To avoid any races in the first place, we take the its_cmd lock, which
- * protects our ring buffer variables, so that there is only one user
- * per ITS handling commands at a given time.
- */
-static void vgic_mmio_write_its_cwriter(struct kvm *kvm, struct vgic_its *its,
-                                       gpa_t addr, unsigned int len,
-                                       unsigned long val)
-{
-       u64 reg;
-
-       if (!its)
-               return;
-
-       mutex_lock(&its->cmd_lock);
-
-       reg = update_64bit_reg(its->cwriter, addr & 7, len, val);
-       reg = ITS_CMD_OFFSET(reg);
-       if (reg >= ITS_CMD_BUFFER_SIZE(its->cbaser)) {
-               mutex_unlock(&its->cmd_lock);
-               return;
-       }
-       its->cwriter = reg;
-
-       vgic_its_process_commands(kvm, its);
-
-       mutex_unlock(&its->cmd_lock);
-}
-
-static unsigned long vgic_mmio_read_its_cwriter(struct kvm *kvm,
-                                               struct vgic_its *its,
-                                               gpa_t addr, unsigned int len)
-{
-       return extract_bytes(its->cwriter, addr & 0x7, len);
-}
-
-static unsigned long vgic_mmio_read_its_creadr(struct kvm *kvm,
-                                              struct vgic_its *its,
-                                              gpa_t addr, unsigned int len)
-{
-       return extract_bytes(its->creadr, addr & 0x7, len);
-}
-
-static int vgic_mmio_uaccess_write_its_creadr(struct kvm *kvm,
-                                             struct vgic_its *its,
-                                             gpa_t addr, unsigned int len,
-                                             unsigned long val)
-{
-       u32 cmd_offset;
-       int ret = 0;
-
-       mutex_lock(&its->cmd_lock);
-
-       if (its->enabled) {
-               ret = -EBUSY;
-               goto out;
-       }
-
-       cmd_offset = ITS_CMD_OFFSET(val);
-       if (cmd_offset >= ITS_CMD_BUFFER_SIZE(its->cbaser)) {
-               ret = -EINVAL;
-               goto out;
-       }
-
-       its->creadr = cmd_offset;
-out:
-       mutex_unlock(&its->cmd_lock);
-       return ret;
-}
-
-#define BASER_INDEX(addr) (((addr) / sizeof(u64)) & 0x7)
-static unsigned long vgic_mmio_read_its_baser(struct kvm *kvm,
-                                             struct vgic_its *its,
-                                             gpa_t addr, unsigned int len)
-{
-       u64 reg;
-
-       switch (BASER_INDEX(addr)) {
-       case 0:
-               reg = its->baser_device_table;
-               break;
-       case 1:
-               reg = its->baser_coll_table;
-               break;
-       default:
-               reg = 0;
-               break;
-       }
-
-       return extract_bytes(reg, addr & 7, len);
-}
-
-#define GITS_BASER_RO_MASK     (GENMASK_ULL(52, 48) | GENMASK_ULL(58, 56))
-static void vgic_mmio_write_its_baser(struct kvm *kvm,
-                                     struct vgic_its *its,
-                                     gpa_t addr, unsigned int len,
-                                     unsigned long val)
-{
-       const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-       u64 entry_size, table_type;
-       u64 reg, *regptr, clearbits = 0;
-
-       /* When GITS_CTLR.Enable is 1, we ignore write accesses. */
-       if (its->enabled)
-               return;
-
-       switch (BASER_INDEX(addr)) {
-       case 0:
-               regptr = &its->baser_device_table;
-               entry_size = abi->dte_esz;
-               table_type = GITS_BASER_TYPE_DEVICE;
-               break;
-       case 1:
-               regptr = &its->baser_coll_table;
-               entry_size = abi->cte_esz;
-               table_type = GITS_BASER_TYPE_COLLECTION;
-               clearbits = GITS_BASER_INDIRECT;
-               break;
-       default:
-               return;
-       }
-
-       reg = update_64bit_reg(*regptr, addr & 7, len, val);
-       reg &= ~GITS_BASER_RO_MASK;
-       reg &= ~clearbits;
-
-       reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT;
-       reg |= table_type << GITS_BASER_TYPE_SHIFT;
-       reg = vgic_sanitise_its_baser(reg);
-
-       *regptr = reg;
-
-       if (!(reg & GITS_BASER_VALID)) {
-               /* Take the its_lock to prevent a race with a save/restore */
-               mutex_lock(&its->its_lock);
-               switch (table_type) {
-               case GITS_BASER_TYPE_DEVICE:
-                       vgic_its_free_device_list(kvm, its);
-                       break;
-               case GITS_BASER_TYPE_COLLECTION:
-                       vgic_its_free_collection_list(kvm, its);
-                       break;
-               }
-               mutex_unlock(&its->its_lock);
-       }
-}
-
-static unsigned long vgic_mmio_read_its_ctlr(struct kvm *kvm,
-                                            struct vgic_its *its,
-                                            gpa_t addr, unsigned int len)
-{
-       u32 reg = 0;
-
-       mutex_lock(&its->cmd_lock);
-       if (its->creadr == its->cwriter)
-               reg |= GITS_CTLR_QUIESCENT;
-       if (its->enabled)
-               reg |= GITS_CTLR_ENABLE;
-       mutex_unlock(&its->cmd_lock);
-
-       return reg;
-}
-
-static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its,
-                                    gpa_t addr, unsigned int len,
-                                    unsigned long val)
-{
-       mutex_lock(&its->cmd_lock);
-
-       /*
-        * It is UNPREDICTABLE to enable the ITS if the CBASER or the
-        * device/collection BASER registers are invalid.
-        */
-       if (!its->enabled && (val & GITS_CTLR_ENABLE) &&
-               (!(its->baser_device_table & GITS_BASER_VALID) ||
-                !(its->baser_coll_table & GITS_BASER_VALID) ||
-                !(its->cbaser & GITS_CBASER_VALID)))
-               goto out;
-
-       its->enabled = !!(val & GITS_CTLR_ENABLE);
-       if (!its->enabled)
-               vgic_its_invalidate_cache(kvm);
-
-       /*
-        * Try to process any pending commands. This function bails out early
-        * if the ITS is disabled or no commands have been queued.
-        */
-       vgic_its_process_commands(kvm, its);
-
-out:
-       mutex_unlock(&its->cmd_lock);
-}
-
-#define REGISTER_ITS_DESC(off, rd, wr, length, acc)            \
-{                                                              \
-       .reg_offset = off,                                      \
-       .len = length,                                          \
-       .access_flags = acc,                                    \
-       .its_read = rd,                                         \
-       .its_write = wr,                                        \
-}
-
-#define REGISTER_ITS_DESC_UACCESS(off, rd, wr, uwr, length, acc)\
-{                                                              \
-       .reg_offset = off,                                      \
-       .len = length,                                          \
-       .access_flags = acc,                                    \
-       .its_read = rd,                                         \
-       .its_write = wr,                                        \
-       .uaccess_its_write = uwr,                               \
-}
-
-static void its_mmio_write_wi(struct kvm *kvm, struct vgic_its *its,
-                             gpa_t addr, unsigned int len, unsigned long val)
-{
-       /* Ignore */
-}
-
-static struct vgic_register_region its_registers[] = {
-       REGISTER_ITS_DESC(GITS_CTLR,
-               vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4,
-               VGIC_ACCESS_32bit),
-       REGISTER_ITS_DESC_UACCESS(GITS_IIDR,
-               vgic_mmio_read_its_iidr, its_mmio_write_wi,
-               vgic_mmio_uaccess_write_its_iidr, 4,
-               VGIC_ACCESS_32bit),
-       REGISTER_ITS_DESC(GITS_TYPER,
-               vgic_mmio_read_its_typer, its_mmio_write_wi, 8,
-               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-       REGISTER_ITS_DESC(GITS_CBASER,
-               vgic_mmio_read_its_cbaser, vgic_mmio_write_its_cbaser, 8,
-               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-       REGISTER_ITS_DESC(GITS_CWRITER,
-               vgic_mmio_read_its_cwriter, vgic_mmio_write_its_cwriter, 8,
-               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-       REGISTER_ITS_DESC_UACCESS(GITS_CREADR,
-               vgic_mmio_read_its_creadr, its_mmio_write_wi,
-               vgic_mmio_uaccess_write_its_creadr, 8,
-               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-       REGISTER_ITS_DESC(GITS_BASER,
-               vgic_mmio_read_its_baser, vgic_mmio_write_its_baser, 0x40,
-               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-       REGISTER_ITS_DESC(GITS_IDREGS_BASE,
-               vgic_mmio_read_its_idregs, its_mmio_write_wi, 0x30,
-               VGIC_ACCESS_32bit),
-};
-
-/* This is called on setting the LPI enable bit in the redistributor. */
-void vgic_enable_lpis(struct kvm_vcpu *vcpu)
-{
-       if (!(vcpu->arch.vgic_cpu.pendbaser & GICR_PENDBASER_PTZ))
-               its_sync_lpi_pending_table(vcpu);
-}
-
-static int vgic_register_its_iodev(struct kvm *kvm, struct vgic_its *its,
-                                  u64 addr)
-{
-       struct vgic_io_device *iodev = &its->iodev;
-       int ret;
-
-       mutex_lock(&kvm->slots_lock);
-       if (!IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) {
-               ret = -EBUSY;
-               goto out;
-       }
-
-       its->vgic_its_base = addr;
-       iodev->regions = its_registers;
-       iodev->nr_regions = ARRAY_SIZE(its_registers);
-       kvm_iodevice_init(&iodev->dev, &kvm_io_gic_ops);
-
-       iodev->base_addr = its->vgic_its_base;
-       iodev->iodev_type = IODEV_ITS;
-       iodev->its = its;
-       ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, iodev->base_addr,
-                                     KVM_VGIC_V3_ITS_SIZE, &iodev->dev);
-out:
-       mutex_unlock(&kvm->slots_lock);
-
-       return ret;
-}
-
-/* Default is 16 cached LPIs per vcpu */
-#define LPI_DEFAULT_PCPU_CACHE_SIZE    16
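-/* e.g. a guest with 4 online vcpus ends up with a 64-entry cache */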
-
-void vgic_lpi_translation_cache_init(struct kvm *kvm)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       unsigned int sz;
-       int i;
-
-       if (!list_empty(&dist->lpi_translation_cache))
-               return;
-
-       sz = atomic_read(&kvm->online_vcpus) * LPI_DEFAULT_PCPU_CACHE_SIZE;
-
-       for (i = 0; i < sz; i++) {
-               struct vgic_translation_cache_entry *cte;
-
-               /* An allocation failure is not fatal */
-               cte = kzalloc(sizeof(*cte), GFP_KERNEL);
-               if (WARN_ON(!cte))
-                       break;
-
-               INIT_LIST_HEAD(&cte->entry);
-               list_add(&cte->entry, &dist->lpi_translation_cache);
-       }
-}
-
-void vgic_lpi_translation_cache_destroy(struct kvm *kvm)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       struct vgic_translation_cache_entry *cte, *tmp;
-
-       vgic_its_invalidate_cache(kvm);
-
-       list_for_each_entry_safe(cte, tmp,
-                                &dist->lpi_translation_cache, entry) {
-               list_del(&cte->entry);
-               kfree(cte);
-       }
-}
-
-#define INITIAL_BASER_VALUE                                              \
-       (GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb)                | \
-        GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner)         | \
-        GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable)             | \
-        GITS_BASER_PAGE_SIZE_64K)
-
-#define INITIAL_PROPBASER_VALUE                                                  \
-       (GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb)            | \
-        GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, SameAsInner)     | \
-        GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable))
-
-static int vgic_its_create(struct kvm_device *dev, u32 type)
-{
-       struct vgic_its *its;
-
-       if (type != KVM_DEV_TYPE_ARM_VGIC_ITS)
-               return -ENODEV;
-
-       its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL);
-       if (!its)
-               return -ENOMEM;
-
-       if (vgic_initialized(dev->kvm)) {
-               int ret = vgic_v4_init(dev->kvm);
-               if (ret < 0) {
-                       kfree(its);
-                       return ret;
-               }
-
-               vgic_lpi_translation_cache_init(dev->kvm);
-       }
-
-       mutex_init(&its->its_lock);
-       mutex_init(&its->cmd_lock);
-
-       its->vgic_its_base = VGIC_ADDR_UNDEF;
-
-       INIT_LIST_HEAD(&its->device_list);
-       INIT_LIST_HEAD(&its->collection_list);
-
-       dev->kvm->arch.vgic.msis_require_devid = true;
-       dev->kvm->arch.vgic.has_its = true;
-       its->enabled = false;
-       its->dev = dev;
-
-       its->baser_device_table = INITIAL_BASER_VALUE                   |
-               ((u64)GITS_BASER_TYPE_DEVICE << GITS_BASER_TYPE_SHIFT);
-       its->baser_coll_table = INITIAL_BASER_VALUE |
-               ((u64)GITS_BASER_TYPE_COLLECTION << GITS_BASER_TYPE_SHIFT);
-       dev->kvm->arch.vgic.propbaser = INITIAL_PROPBASER_VALUE;
-
-       dev->private = its;
-
-       return vgic_its_set_abi(its, NR_ITS_ABIS - 1);
-}
-
-static void vgic_its_destroy(struct kvm_device *kvm_dev)
-{
-       struct kvm *kvm = kvm_dev->kvm;
-       struct vgic_its *its = kvm_dev->private;
-
-       mutex_lock(&its->its_lock);
-
-       vgic_its_free_device_list(kvm, its);
-       vgic_its_free_collection_list(kvm, its);
-
-       mutex_unlock(&its->its_lock);
-       kfree(its);
-       kfree(kvm_dev); /* alloc by kvm_ioctl_create_device, free by .destroy */
-}
-
-static int vgic_its_has_attr_regs(struct kvm_device *dev,
-                                 struct kvm_device_attr *attr)
-{
-       const struct vgic_register_region *region;
-       gpa_t offset = attr->attr;
-       int align;
-
-       align = (offset < GITS_TYPER) || (offset >= GITS_PIDR4) ? 0x3 : 0x7;
-
-       if (offset & align)
-               return -EINVAL;
-
-       region = vgic_find_mmio_region(its_registers,
-                                      ARRAY_SIZE(its_registers),
-                                      offset);
-       if (!region)
-               return -ENXIO;
-
-       return 0;
-}
-
-static int vgic_its_attr_regs_access(struct kvm_device *dev,
-                                    struct kvm_device_attr *attr,
-                                    u64 *reg, bool is_write)
-{
-       const struct vgic_register_region *region;
-       struct vgic_its *its;
-       gpa_t addr, offset;
-       unsigned int len;
-       int align, ret = 0;
-
-       its = dev->private;
-       offset = attr->attr;
-
-       /*
-        * Although the spec supports upper/lower 32-bit accesses to
-        * 64-bit ITS registers, the userspace ABI requires 64-bit
-        * accesses to all 64-bit wide registers. We therefore only
-        * support 32-bit accesses to GITS_CTLR, GITS_IIDR and GITS ID
-        * registers.
-        */
-       if ((offset < GITS_TYPER) || (offset >= GITS_PIDR4))
-               align = 0x3;
-       else
-               align = 0x7;
-
-       if (offset & align)
-               return -EINVAL;
-
-       mutex_lock(&dev->kvm->lock);
-
-       if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) {
-               ret = -ENXIO;
-               goto out;
-       }
-
-       region = vgic_find_mmio_region(its_registers,
-                                      ARRAY_SIZE(its_registers),
-                                      offset);
-       if (!region) {
-               ret = -ENXIO;
-               goto out;
-       }
-
-       if (!lock_all_vcpus(dev->kvm)) {
-               ret = -EBUSY;
-               goto out;
-       }
-
-       addr = its->vgic_its_base + offset;
-
-       len = region->access_flags & VGIC_ACCESS_64bit ? 8 : 4;
-
-       if (is_write) {
-               if (region->uaccess_its_write)
-                       ret = region->uaccess_its_write(dev->kvm, its, addr,
-                                                       len, *reg);
-               else
-                       region->its_write(dev->kvm, its, addr, len, *reg);
-       } else {
-               *reg = region->its_read(dev->kvm, its, addr, len);
-       }
-       unlock_all_vcpus(dev->kvm);
-out:
-       mutex_unlock(&dev->kvm->lock);
-       return ret;
-}
-
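-/*
- * The two helpers below compute the distance from an entry's ID to the next
- * ID in its sorted list; this delta is stored in the "next" field of the
- * saved DTE/ITE. For instance, if device IDs 0 and 5 are adjacent in the
- * device list, the DTE saved for device 0 records an offset of 5 (capped at
- * VITS_DTE_MAX_DEVID_OFFSET).
- */
-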
-static u32 compute_next_devid_offset(struct list_head *h,
-                                    struct its_device *dev)
-{
-       struct its_device *next;
-       u32 next_offset;
-
-       if (list_is_last(&dev->dev_list, h))
-               return 0;
-       next = list_next_entry(dev, dev_list);
-       next_offset = next->device_id - dev->device_id;
-
-       return min_t(u32, next_offset, VITS_DTE_MAX_DEVID_OFFSET);
-}
-
-static u32 compute_next_eventid_offset(struct list_head *h, struct its_ite *ite)
-{
-       struct its_ite *next;
-       u32 next_offset;
-
-       if (list_is_last(&ite->ite_list, h))
-               return 0;
-       next = list_next_entry(ite, ite_list);
-       next_offset = next->event_id - ite->event_id;
-
-       return min_t(u32, next_offset, VITS_ITE_MAX_EVENTID_OFFSET);
-}
-
-/**
- * entry_fn_t - Callback called on a table entry restore path
- * @its: its handle
- * @id: id of the entry
- * @entry: pointer to the entry
- * @opaque: pointer to an opaque data
- *
- * Return: < 0 on error, 0 if last element was identified, id offset to next
- * element otherwise
- */
-typedef int (*entry_fn_t)(struct vgic_its *its, u32 id, void *entry,
-                         void *opaque);
-
-/**
- * scan_its_table - Scan a contiguous table in guest RAM and apply a function
- * to each entry
- *
- * @its: its handle
- * @base: base gpa of the table
- * @size: size of the table in bytes
- * @esz: entry size in bytes
- * @start_id: the ID of the first entry in the table
- * (non-zero for second level tables)
- * @fn: function to apply on each entry
- *
- * Return: < 0 on error, 0 if last element was identified, 1 otherwise
- * (the last element may not be found on second level tables)
- */
-static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz,
-                         int start_id, entry_fn_t fn, void *opaque)
-{
-       struct kvm *kvm = its->dev->kvm;
-       unsigned long len = size;
-       int id = start_id;
-       gpa_t gpa = base;
-       char entry[ESZ_MAX];
-       int ret;
-
-       memset(entry, 0, esz);
-
-       while (len > 0) {
-               int next_offset;
-               size_t byte_offset;
-
-               ret = kvm_read_guest_lock(kvm, gpa, entry, esz);
-               if (ret)
-                       return ret;
-
-               next_offset = fn(its, id, entry, opaque);
-               if (next_offset <= 0)
-                       return next_offset;
-
-               byte_offset = next_offset * esz;
-               id += next_offset;
-               gpa += byte_offset;
-               len -= byte_offset;
-       }
-       return 1;
-}
-
-/**
- * vgic_its_save_ite - Save an interrupt translation entry at @gpa
- */
-static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev,
-                             struct its_ite *ite, gpa_t gpa, int ite_esz)
-{
-       struct kvm *kvm = its->dev->kvm;
-       u32 next_offset;
-       u64 val;
-
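-       /*
-        * The v0 save ABI packs an ITE as the event-ID delta to the next
-        * ITE, the physical LPI INTID and, in the low bits, the collection
-        * ID (see the KVM_ITS_ITE_* shifts).
-        */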
-       next_offset = compute_next_eventid_offset(&dev->itt_head, ite);
-       val = ((u64)next_offset << KVM_ITS_ITE_NEXT_SHIFT) |
-              ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) |
-               ite->collection->collection_id;
-       val = cpu_to_le64(val);
-       return kvm_write_guest_lock(kvm, gpa, &val, ite_esz);
-}
-
-/**
- * vgic_its_restore_ite - restore an interrupt translation entry
- * @event_id: id used for indexing
- * @ptr: pointer to the ITE entry
- * @opaque: pointer to the its_device
- */
-static int vgic_its_restore_ite(struct vgic_its *its, u32 event_id,
-                               void *ptr, void *opaque)
-{
-       struct its_device *dev = (struct its_device *)opaque;
-       struct its_collection *collection;
-       struct kvm *kvm = its->dev->kvm;
-       struct kvm_vcpu *vcpu = NULL;
-       u64 val;
-       u64 *p = (u64 *)ptr;
-       struct vgic_irq *irq;
-       u32 coll_id, lpi_id;
-       struct its_ite *ite;
-       u32 offset;
-
-       val = *p;
-
-       val = le64_to_cpu(val);
-
-       coll_id = val & KVM_ITS_ITE_ICID_MASK;
-       lpi_id = (val & KVM_ITS_ITE_PINTID_MASK) >> KVM_ITS_ITE_PINTID_SHIFT;
-
-       if (!lpi_id)
-               return 1; /* invalid entry, no choice but to scan next entry */
-
-       if (lpi_id < VGIC_MIN_LPI)
-               return -EINVAL;
-
-       offset = val >> KVM_ITS_ITE_NEXT_SHIFT;
-       if (event_id + offset >= BIT_ULL(dev->num_eventid_bits))
-               return -EINVAL;
-
-       collection = find_collection(its, coll_id);
-       if (!collection)
-               return -EINVAL;
-
-       ite = vgic_its_alloc_ite(dev, collection, event_id);
-       if (IS_ERR(ite))
-               return PTR_ERR(ite);
-
-       if (its_is_collection_mapped(collection))
-               vcpu = kvm_get_vcpu(kvm, collection->target_addr);
-
-       irq = vgic_add_lpi(kvm, lpi_id, vcpu);
-       if (IS_ERR(irq))
-               return PTR_ERR(irq);
-       ite->irq = irq;
-
-       return offset;
-}
-
-static int vgic_its_ite_cmp(void *priv, struct list_head *a,
-                           struct list_head *b)
-{
-       struct its_ite *itea = container_of(a, struct its_ite, ite_list);
-       struct its_ite *iteb = container_of(b, struct its_ite, ite_list);
-
-       if (itea->event_id < iteb->event_id)
-               return -1;
-       else
-               return 1;
-}
-
-static int vgic_its_save_itt(struct vgic_its *its, struct its_device *device)
-{
-       const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-       gpa_t base = device->itt_addr;
-       struct its_ite *ite;
-       int ret;
-       int ite_esz = abi->ite_esz;
-
-       list_sort(NULL, &device->itt_head, vgic_its_ite_cmp);
-
-       list_for_each_entry(ite, &device->itt_head, ite_list) {
-               gpa_t gpa = base + ite->event_id * ite_esz;
-
-               /*
-                * If an LPI carries the HW bit, this means that this
-                * interrupt is controlled by GICv4, and we do not
-                * have direct access to that state. Let's simply fail
-                * the save operation...
-                */
-               if (ite->irq->hw)
-                       return -EACCES;
-
-               ret = vgic_its_save_ite(its, device, ite, gpa, ite_esz);
-               if (ret)
-                       return ret;
-       }
-       return 0;
-}
-
-/**
- * vgic_its_restore_itt - restore the ITT of a device
- *
- * @its: its handle
- * @dev: device handle
- *
- * Return 0 on success, < 0 on error
- */
-static int vgic_its_restore_itt(struct vgic_its *its, struct its_device *dev)
-{
-       const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-       gpa_t base = dev->itt_addr;
-       int ret;
-       int ite_esz = abi->ite_esz;
-       size_t max_size = BIT_ULL(dev->num_eventid_bits) * ite_esz;
-
-       ret = scan_its_table(its, base, max_size, ite_esz, 0,
-                            vgic_its_restore_ite, dev);
-
-       /* scan_its_table returns +1 if all ITEs are invalid */
-       if (ret > 0)
-               ret = 0;
-
-       return ret;
-}
-
-/**
- * vgic_its_save_dte - Save a device table entry at a given GPA
- *
- * @its: ITS handle
- * @dev: ITS device
- * @ptr: GPA
- */
-static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev,
-                            gpa_t ptr, int dte_esz)
-{
-       struct kvm *kvm = its->dev->kvm;
-       u64 val, itt_addr_field;
-       u32 next_offset;
-
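-       /*
-        * Pack the DTE as defined by the v0 save ABI: a valid bit, the
-        * device-ID delta to the next DTE, the ITT address shifted down
-        * by 8 bits and the number of event ID bits minus one.
-        */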
-       itt_addr_field = dev->itt_addr >> 8;
-       next_offset = compute_next_devid_offset(&its->device_list, dev);
-       val = (1ULL << KVM_ITS_DTE_VALID_SHIFT |
-              ((u64)next_offset << KVM_ITS_DTE_NEXT_SHIFT) |
-              (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) |
-               (dev->num_eventid_bits - 1));
-       val = cpu_to_le64(val);
-       return kvm_write_guest_lock(kvm, ptr, &val, dte_esz);
-}
-
-/**
- * vgic_its_restore_dte - restore a device table entry
- *
- * @its: its handle
- * @id: device id the DTE corresponds to
- * @ptr: kernel VA where the 8 byte DTE is located
- * @opaque: unused
- *
- * Return: < 0 on error, 0 if the dte is the last one, id offset to the
- * next dte otherwise
- */
-static int vgic_its_restore_dte(struct vgic_its *its, u32 id,
-                               void *ptr, void *opaque)
-{
-       struct its_device *dev;
-       gpa_t itt_addr;
-       u8 num_eventid_bits;
-       u64 entry = *(u64 *)ptr;
-       bool valid;
-       u32 offset;
-       int ret;
-
-       entry = le64_to_cpu(entry);
-
-       valid = entry >> KVM_ITS_DTE_VALID_SHIFT;
-       num_eventid_bits = (entry & KVM_ITS_DTE_SIZE_MASK) + 1;
-       itt_addr = ((entry & KVM_ITS_DTE_ITTADDR_MASK)
-                       >> KVM_ITS_DTE_ITTADDR_SHIFT) << 8;
-
-       if (!valid)
-               return 1;
-
-       /* dte entry is valid */
-       offset = (entry & KVM_ITS_DTE_NEXT_MASK) >> KVM_ITS_DTE_NEXT_SHIFT;
-
-       dev = vgic_its_alloc_device(its, id, itt_addr, num_eventid_bits);
-       if (IS_ERR(dev))
-               return PTR_ERR(dev);
-
-       ret = vgic_its_restore_itt(its, dev);
-       if (ret) {
-               vgic_its_free_device(its->dev->kvm, dev);
-               return ret;
-       }
-
-       return offset;
-}
-
-static int vgic_its_device_cmp(void *priv, struct list_head *a,
-                              struct list_head *b)
-{
-       struct its_device *deva = container_of(a, struct its_device, dev_list);
-       struct its_device *devb = container_of(b, struct its_device, dev_list);
-
-       if (deva->device_id < devb->device_id)
-               return -1;
-       else
-               return 1;
-}
-
-/**
- * vgic_its_save_device_tables - Save the device table and all ITTs
- * into guest RAM
- *
- * L1/L2 handling is hidden by vgic_its_check_id() helper which directly
- * returns the GPA of the device entry
- */
-static int vgic_its_save_device_tables(struct vgic_its *its)
-{
-       const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-       u64 baser = its->baser_device_table;
-       struct its_device *dev;
-       int dte_esz = abi->dte_esz;
-
-       if (!(baser & GITS_BASER_VALID))
-               return 0;
-
-       list_sort(NULL, &its->device_list, vgic_its_device_cmp);
-
-       list_for_each_entry(dev, &its->device_list, dev_list) {
-               int ret;
-               gpa_t eaddr;
-
-               if (!vgic_its_check_id(its, baser,
-                                      dev->device_id, &eaddr))
-                       return -EINVAL;
-
-               ret = vgic_its_save_itt(its, dev);
-               if (ret)
-                       return ret;
-
-               ret = vgic_its_save_dte(its, dev, eaddr, dte_esz);
-               if (ret)
-                       return ret;
-       }
-       return 0;
-}
-
-/**
- * handle_l1_dte - callback used for L1 device table entries (two-level table case)
- *
- * @its: its handle
- * @id: index of the entry in the L1 table
- * @addr: kernel VA
- * @opaque: unused
- *
- * L1 table entries are scanned one entry at a time.
- * Return: < 0 on error, 0 if the last DTE was found while scanning the L2
- * table, +1 otherwise (meaning the next L1 entry must be scanned)
- */
-static int handle_l1_dte(struct vgic_its *its, u32 id, void *addr,
-                        void *opaque)
-{
-       const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-       int l2_start_id = id * (SZ_64K / abi->dte_esz);
-       u64 entry = *(u64 *)addr;
-       int dte_esz = abi->dte_esz;
-       gpa_t gpa;
-       int ret;
-
-       entry = le64_to_cpu(entry);
-
-       if (!(entry & KVM_ITS_L1E_VALID_MASK))
-               return 1;
-
-       gpa = entry & KVM_ITS_L1E_ADDR_MASK;
-
-       ret = scan_its_table(its, gpa, SZ_64K, dte_esz,
-                            l2_start_id, vgic_its_restore_dte, NULL);
-
-       return ret;
-}
-
-/**
- * vgic_its_restore_device_tables - Restore the device table and all ITTs
- * from guest RAM to internal data structs
- */
-static int vgic_its_restore_device_tables(struct vgic_its *its)
-{
-       const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-       u64 baser = its->baser_device_table;
-       int l1_esz, ret;
-       int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
-       gpa_t l1_gpa;
-
-       if (!(baser & GITS_BASER_VALID))
-               return 0;
-
-       l1_gpa = GITS_BASER_ADDR_48_to_52(baser);
-
-       if (baser & GITS_BASER_INDIRECT) {
-               l1_esz = GITS_LVL1_ENTRY_SIZE;
-               ret = scan_its_table(its, l1_gpa, l1_tbl_size, l1_esz, 0,
-                                    handle_l1_dte, NULL);
-       } else {
-               l1_esz = abi->dte_esz;
-               ret = scan_its_table(its, l1_gpa, l1_tbl_size, l1_esz, 0,
-                                    vgic_its_restore_dte, NULL);
-       }
-
-       /* scan_its_table returns +1 if all entries are invalid */
-       if (ret > 0)
-               ret = 0;
-
-       return ret;
-}
-
-static int vgic_its_save_cte(struct vgic_its *its,
-                            struct its_collection *collection,
-                            gpa_t gpa, int esz)
-{
-       u64 val;
-
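-       /*
-        * A saved CTE mirrors the DTE/ITE scheme: a valid bit, the target
-        * vcpu index (RDBASE field) and, in the low bits, the collection ID.
-        */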
-       val = (1ULL << KVM_ITS_CTE_VALID_SHIFT |
-              ((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) |
-              collection->collection_id);
-       val = cpu_to_le64(val);
-       return kvm_write_guest_lock(its->dev->kvm, gpa, &val, esz);
-}
-
-static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz)
-{
-       struct its_collection *collection;
-       struct kvm *kvm = its->dev->kvm;
-       u32 target_addr, coll_id;
-       u64 val;
-       int ret;
-
-       BUG_ON(esz > sizeof(val));
-       ret = kvm_read_guest_lock(kvm, gpa, &val, esz);
-       if (ret)
-               return ret;
-       val = le64_to_cpu(val);
-       if (!(val & KVM_ITS_CTE_VALID_MASK))
-               return 0;
-
-       target_addr = (u32)(val >> KVM_ITS_CTE_RDBASE_SHIFT);
-       coll_id = val & KVM_ITS_CTE_ICID_MASK;
-
-       if (target_addr != COLLECTION_NOT_MAPPED &&
-           target_addr >= atomic_read(&kvm->online_vcpus))
-               return -EINVAL;
-
-       collection = find_collection(its, coll_id);
-       if (collection)
-               return -EEXIST;
-       ret = vgic_its_alloc_collection(its, &collection, coll_id);
-       if (ret)
-               return ret;
-       collection->target_addr = target_addr;
-       return 1;
-}
-
-/**
- * vgic_its_save_collection_table - Save the collection table into
- * guest RAM
- */
-static int vgic_its_save_collection_table(struct vgic_its *its)
-{
-       const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-       u64 baser = its->baser_coll_table;
-       gpa_t gpa = GITS_BASER_ADDR_48_to_52(baser);
-       struct its_collection *collection;
-       u64 val;
-       size_t max_size, filled = 0;
-       int ret, cte_esz = abi->cte_esz;
-
-       if (!(baser & GITS_BASER_VALID))
-               return 0;
-
-       max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
-
-       list_for_each_entry(collection, &its->collection_list, coll_list) {
-               ret = vgic_its_save_cte(its, collection, gpa, cte_esz);
-               if (ret)
-                       return ret;
-               gpa += cte_esz;
-               filled += cte_esz;
-       }
-
-       if (filled == max_size)
-               return 0;
-
-       /*
-        * The table is not fully filled; add a final dummy element
-        * with the valid bit unset.
-        */
-       val = 0;
-       BUG_ON(cte_esz > sizeof(val));
-       ret = kvm_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz);
-       return ret;
-}
-
-/**
- * vgic_its_restore_collection_table - reads the collection table
- * in guest memory and restores the ITS internal state. Requires the
- * BASER registers to be restored beforehand.
- */
-static int vgic_its_restore_collection_table(struct vgic_its *its)
-{
-       const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-       u64 baser = its->baser_coll_table;
-       int cte_esz = abi->cte_esz;
-       size_t max_size, read = 0;
-       gpa_t gpa;
-       int ret;
-
-       if (!(baser & GITS_BASER_VALID))
-               return 0;
-
-       gpa = GITS_BASER_ADDR_48_to_52(baser);
-
-       max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
-
-       while (read < max_size) {
-               ret = vgic_its_restore_cte(its, gpa, cte_esz);
-               if (ret <= 0)
-                       break;
-               gpa += cte_esz;
-               read += cte_esz;
-       }
-
-       if (ret > 0)
-               return 0;
-
-       return ret;
-}
-
-/**
- * vgic_its_save_tables_v0 - Save the ITS tables into guest RAM
- * according to the v0 ABI
- */
-static int vgic_its_save_tables_v0(struct vgic_its *its)
-{
-       int ret;
-
-       ret = vgic_its_save_device_tables(its);
-       if (ret)
-               return ret;
-
-       return vgic_its_save_collection_table(its);
-}
-
-/**
- * vgic_its_restore_tables_v0 - Restore the ITS tables from guest RAM
- * to internal data structures according to the v0 ABI
- */
-static int vgic_its_restore_tables_v0(struct vgic_its *its)
-{
-       int ret;
-
-       ret = vgic_its_restore_collection_table(its);
-       if (ret)
-               return ret;
-
-       return vgic_its_restore_device_tables(its);
-}
-
-static int vgic_its_commit_v0(struct vgic_its *its)
-{
-       const struct vgic_its_abi *abi;
-
-       abi = vgic_its_get_abi(its);
-       its->baser_coll_table &= ~GITS_BASER_ENTRY_SIZE_MASK;
-       its->baser_device_table &= ~GITS_BASER_ENTRY_SIZE_MASK;
-
-       its->baser_coll_table |= (GIC_ENCODE_SZ(abi->cte_esz, 5)
-                                       << GITS_BASER_ENTRY_SIZE_SHIFT);
-
-       its->baser_device_table |= (GIC_ENCODE_SZ(abi->dte_esz, 5)
-                                       << GITS_BASER_ENTRY_SIZE_SHIFT);
-       return 0;
-}
-
-static void vgic_its_reset(struct kvm *kvm, struct vgic_its *its)
-{
-       /* We need to keep the ABI-specific field values */
-       its->baser_coll_table &= ~GITS_BASER_VALID;
-       its->baser_device_table &= ~GITS_BASER_VALID;
-       its->cbaser = 0;
-       its->creadr = 0;
-       its->cwriter = 0;
-       its->enabled = 0;
-       vgic_its_free_device_list(kvm, its);
-       vgic_its_free_collection_list(kvm, its);
-}
-
-static int vgic_its_has_attr(struct kvm_device *dev,
-                            struct kvm_device_attr *attr)
-{
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_ADDR:
-               switch (attr->attr) {
-               case KVM_VGIC_ITS_ADDR_TYPE:
-                       return 0;
-               }
-               break;
-       case KVM_DEV_ARM_VGIC_GRP_CTRL:
-               switch (attr->attr) {
-               case KVM_DEV_ARM_VGIC_CTRL_INIT:
-                       return 0;
-               case KVM_DEV_ARM_ITS_CTRL_RESET:
-                       return 0;
-               case KVM_DEV_ARM_ITS_SAVE_TABLES:
-                       return 0;
-               case KVM_DEV_ARM_ITS_RESTORE_TABLES:
-                       return 0;
-               }
-               break;
-       case KVM_DEV_ARM_VGIC_GRP_ITS_REGS:
-               return vgic_its_has_attr_regs(dev, attr);
-       }
-       return -ENXIO;
-}
-
-static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
-{
-       const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-       int ret = 0;
-
-       if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */
-               return 0;
-
-       mutex_lock(&kvm->lock);
-       mutex_lock(&its->its_lock);
-
-       if (!lock_all_vcpus(kvm)) {
-               mutex_unlock(&its->its_lock);
-               mutex_unlock(&kvm->lock);
-               return -EBUSY;
-       }
-
-       switch (attr) {
-       case KVM_DEV_ARM_ITS_CTRL_RESET:
-               vgic_its_reset(kvm, its);
-               break;
-       case KVM_DEV_ARM_ITS_SAVE_TABLES:
-               ret = abi->save_tables(its);
-               break;
-       case KVM_DEV_ARM_ITS_RESTORE_TABLES:
-               ret = abi->restore_tables(its);
-               break;
-       }
-
-       unlock_all_vcpus(kvm);
-       mutex_unlock(&its->its_lock);
-       mutex_unlock(&kvm->lock);
-       return ret;
-}
-
-static int vgic_its_set_attr(struct kvm_device *dev,
-                            struct kvm_device_attr *attr)
-{
-       struct vgic_its *its = dev->private;
-       int ret;
-
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_ADDR: {
-               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-               unsigned long type = (unsigned long)attr->attr;
-               u64 addr;
-
-               if (type != KVM_VGIC_ITS_ADDR_TYPE)
-                       return -ENODEV;
-
-               if (copy_from_user(&addr, uaddr, sizeof(addr)))
-                       return -EFAULT;
-
-               ret = vgic_check_ioaddr(dev->kvm, &its->vgic_its_base,
-                                       addr, SZ_64K);
-               if (ret)
-                       return ret;
-
-               return vgic_register_its_iodev(dev->kvm, its, addr);
-       }
-       case KVM_DEV_ARM_VGIC_GRP_CTRL:
-               return vgic_its_ctrl(dev->kvm, its, attr->attr);
-       case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: {
-               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-               u64 reg;
-
-               if (get_user(reg, uaddr))
-                       return -EFAULT;
-
-               return vgic_its_attr_regs_access(dev, attr, &reg, true);
-       }
-       }
-       return -ENXIO;
-}
-
-static int vgic_its_get_attr(struct kvm_device *dev,
-                            struct kvm_device_attr *attr)
-{
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_ADDR: {
-               struct vgic_its *its = dev->private;
-               u64 addr = its->vgic_its_base;
-               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-               unsigned long type = (unsigned long)attr->attr;
-
-               if (type != KVM_VGIC_ITS_ADDR_TYPE)
-                       return -ENODEV;
-
-               if (copy_to_user(uaddr, &addr, sizeof(addr)))
-                       return -EFAULT;
-               break;
-       }
-       case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: {
-               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-               u64 reg;
-               int ret;
-
-               ret = vgic_its_attr_regs_access(dev, attr, &reg, false);
-               if (ret)
-                       return ret;
-               return put_user(reg, uaddr);
-       }
-       default:
-               return -ENXIO;
-       }
-
-       return 0;
-}
-
-static struct kvm_device_ops kvm_arm_vgic_its_ops = {
-       .name = "kvm-arm-vgic-its",
-       .create = vgic_its_create,
-       .destroy = vgic_its_destroy,
-       .set_attr = vgic_its_set_attr,
-       .get_attr = vgic_its_get_attr,
-       .has_attr = vgic_its_has_attr,
-};
-
-int kvm_vgic_register_its_device(void)
-{
-       return kvm_register_device_ops(&kvm_arm_vgic_its_ops,
-                                      KVM_DEV_TYPE_ARM_VGIC_ITS);
-}
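
As a side note on the entry format handled by vgic_its_save_cte()/vgic_its_restore_cte() above: a v0-ABI collection table entry is a single little-endian 64-bit word carrying a valid bit, the target redistributor (stored as a vCPU index) and the collection ID. The sketch below re-expresses just that packing as standalone C; the CTE_* constants are local stand-ins for the kernel's KVM_ITS_CTE_* macros and are assumptions for illustration, and the cpu_to_le64()/le64_to_cpu() conversion done around the guest accesses is left out.

#include <stdint.h>
#include <stdio.h>

/* Local stand-ins for KVM_ITS_CTE_* (assumed values, illustration only). */
#define CTE_VALID_SHIFT		63
#define CTE_RDBASE_SHIFT	16
#define CTE_ICID_MASK		0xffffULL

/* Pack one collection table entry, mirroring vgic_its_save_cte(). */
static uint64_t pack_cte(uint32_t target_vcpu, uint16_t coll_id)
{
	return (1ULL << CTE_VALID_SHIFT) |
	       ((uint64_t)target_vcpu << CTE_RDBASE_SHIFT) |
	       coll_id;
}

/* Decode one entry, mirroring vgic_its_restore_cte(); returns 1 if valid. */
static int unpack_cte(uint64_t val, uint32_t *target_vcpu, uint16_t *coll_id)
{
	if (!(val & (1ULL << CTE_VALID_SHIFT)))
		return 0;			/* invalid entry ends the scan */
	*target_vcpu = (uint32_t)(val >> CTE_RDBASE_SHIFT);
	*coll_id = val & CTE_ICID_MASK;
	return 1;
}

int main(void)
{
	uint32_t vcpu;
	uint16_t icid;

	if (unpack_cte(pack_cte(2, 5), &vcpu, &icid))
		printf("collection %u -> vcpu %u\n", (unsigned int)icid,
		       (unsigned int)vcpu);
	return 0;
}

An all-zero word (valid bit clear) is exactly what vgic_its_save_collection_table() appends when the table is not full, and what makes vgic_its_restore_cte() return 0 to stop the restore loop.
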
diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c
deleted file mode 100644 (file)
index 4441967..0000000
+++ /dev/null
@@ -1,741 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * VGIC: KVM DEVICE API
- *
- * Copyright (C) 2015 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- */
-#include <linux/kvm_host.h>
-#include <kvm/arm_vgic.h>
-#include <linux/uaccess.h>
-#include <asm/kvm_mmu.h>
-#include <asm/cputype.h>
-#include "vgic.h"
-
-/* common helpers */
-
-int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
-                     phys_addr_t addr, phys_addr_t alignment)
-{
-       if (addr & ~kvm_phys_mask(kvm))
-               return -E2BIG;
-
-       if (!IS_ALIGNED(addr, alignment))
-               return -EINVAL;
-
-       if (!IS_VGIC_ADDR_UNDEF(*ioaddr))
-               return -EEXIST;
-
-       return 0;
-}
-
-static int vgic_check_type(struct kvm *kvm, int type_needed)
-{
-       if (kvm->arch.vgic.vgic_model != type_needed)
-               return -ENODEV;
-       else
-               return 0;
-}
-
-/**
- * kvm_vgic_addr - set or get vgic VM base addresses
- * @kvm:   pointer to the vm struct
- * @type:  the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX
- * @addr:  pointer to address value
- * @write: if true set the address in the VM address space, if false read the
- *          address
- *
- * Set or get the vgic base addresses for the distributor and the virtual CPU
- * interface in the VM physical address space.  These addresses are properties
- * of the emulated core/SoC and therefore user space initially knows this
- * information.
- * Check them for sanity (alignment, double assignment). We can't check for
- * overlapping regions in case of a virtual GICv3 here, since we don't know
- * the number of VCPUs yet, so we defer this check to map_resources().
- */
-int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
-{
-       int r = 0;
-       struct vgic_dist *vgic = &kvm->arch.vgic;
-       phys_addr_t *addr_ptr, alignment;
-       u64 undef_value = VGIC_ADDR_UNDEF;
-
-       mutex_lock(&kvm->lock);
-       switch (type) {
-       case KVM_VGIC_V2_ADDR_TYPE_DIST:
-               r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
-               addr_ptr = &vgic->vgic_dist_base;
-               alignment = SZ_4K;
-               break;
-       case KVM_VGIC_V2_ADDR_TYPE_CPU:
-               r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
-               addr_ptr = &vgic->vgic_cpu_base;
-               alignment = SZ_4K;
-               break;
-       case KVM_VGIC_V3_ADDR_TYPE_DIST:
-               r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3);
-               addr_ptr = &vgic->vgic_dist_base;
-               alignment = SZ_64K;
-               break;
-       case KVM_VGIC_V3_ADDR_TYPE_REDIST: {
-               struct vgic_redist_region *rdreg;
-
-               r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3);
-               if (r)
-                       break;
-               if (write) {
-                       r = vgic_v3_set_redist_base(kvm, 0, *addr, 0);
-                       goto out;
-               }
-               rdreg = list_first_entry(&vgic->rd_regions,
-                                        struct vgic_redist_region, list);
-               if (!rdreg)
-                       addr_ptr = &undef_value;
-               else
-                       addr_ptr = &rdreg->base;
-               break;
-       }
-       case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION:
-       {
-               struct vgic_redist_region *rdreg;
-               u8 index;
-
-               r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3);
-               if (r)
-                       break;
-
-               index = *addr & KVM_VGIC_V3_RDIST_INDEX_MASK;
-
-               if (write) {
-                       gpa_t base = *addr & KVM_VGIC_V3_RDIST_BASE_MASK;
-                       u32 count = (*addr & KVM_VGIC_V3_RDIST_COUNT_MASK)
-                                       >> KVM_VGIC_V3_RDIST_COUNT_SHIFT;
-                       u8 flags = (*addr & KVM_VGIC_V3_RDIST_FLAGS_MASK)
-                                       >> KVM_VGIC_V3_RDIST_FLAGS_SHIFT;
-
-                       if (!count || flags)
-                               r = -EINVAL;
-                       else
-                               r = vgic_v3_set_redist_base(kvm, index,
-                                                           base, count);
-                       goto out;
-               }
-
-               rdreg = vgic_v3_rdist_region_from_index(kvm, index);
-               if (!rdreg) {
-                       r = -ENOENT;
-                       goto out;
-               }
-
-               *addr = index;
-               *addr |= rdreg->base;
-               *addr |= (u64)rdreg->count << KVM_VGIC_V3_RDIST_COUNT_SHIFT;
-               goto out;
-       }
-       default:
-               r = -ENODEV;
-       }
-
-       if (r)
-               goto out;
-
-       if (write) {
-               r = vgic_check_ioaddr(kvm, addr_ptr, *addr, alignment);
-               if (!r)
-                       *addr_ptr = *addr;
-       } else {
-               *addr = *addr_ptr;
-       }
-
-out:
-       mutex_unlock(&kvm->lock);
-       return r;
-}
-
-static int vgic_set_common_attr(struct kvm_device *dev,
-                               struct kvm_device_attr *attr)
-{
-       int r;
-
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_ADDR: {
-               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-               u64 addr;
-               unsigned long type = (unsigned long)attr->attr;
-
-               if (copy_from_user(&addr, uaddr, sizeof(addr)))
-                       return -EFAULT;
-
-               r = kvm_vgic_addr(dev->kvm, type, &addr, true);
-               return (r == -ENODEV) ? -ENXIO : r;
-       }
-       case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
-               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-               u32 val;
-               int ret = 0;
-
-               if (get_user(val, uaddr))
-                       return -EFAULT;
-
-               /*
-                * We require:
-                * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs
-                * - at most 1024 interrupts
-                * - a multiple of 32 interrupts
-                */
-               if (val < (VGIC_NR_PRIVATE_IRQS + 32) ||
-                   val > VGIC_MAX_RESERVED ||
-                   (val & 31))
-                       return -EINVAL;
-
-               mutex_lock(&dev->kvm->lock);
-
-               if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_spis)
-                       ret = -EBUSY;
-               else
-                       dev->kvm->arch.vgic.nr_spis =
-                               val - VGIC_NR_PRIVATE_IRQS;
-
-               mutex_unlock(&dev->kvm->lock);
-
-               return ret;
-       }
-       case KVM_DEV_ARM_VGIC_GRP_CTRL: {
-               switch (attr->attr) {
-               case KVM_DEV_ARM_VGIC_CTRL_INIT:
-                       mutex_lock(&dev->kvm->lock);
-                       r = vgic_init(dev->kvm);
-                       mutex_unlock(&dev->kvm->lock);
-                       return r;
-               }
-               break;
-       }
-       }
-
-       return -ENXIO;
-}
-
-static int vgic_get_common_attr(struct kvm_device *dev,
-                               struct kvm_device_attr *attr)
-{
-       int r = -ENXIO;
-
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_ADDR: {
-               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-               u64 addr;
-               unsigned long type = (unsigned long)attr->attr;
-
-               r = kvm_vgic_addr(dev->kvm, type, &addr, false);
-               if (r)
-                       return (r == -ENODEV) ? -ENXIO : r;
-
-               if (copy_to_user(uaddr, &addr, sizeof(addr)))
-                       return -EFAULT;
-               break;
-       }
-       case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
-               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-
-               r = put_user(dev->kvm->arch.vgic.nr_spis +
-                            VGIC_NR_PRIVATE_IRQS, uaddr);
-               break;
-       }
-       }
-
-       return r;
-}
-
-static int vgic_create(struct kvm_device *dev, u32 type)
-{
-       return kvm_vgic_create(dev->kvm, type);
-}
-
-static void vgic_destroy(struct kvm_device *dev)
-{
-       kfree(dev);
-}
-
-int kvm_register_vgic_device(unsigned long type)
-{
-       int ret = -ENODEV;
-
-       switch (type) {
-       case KVM_DEV_TYPE_ARM_VGIC_V2:
-               ret = kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
-                                             KVM_DEV_TYPE_ARM_VGIC_V2);
-               break;
-       case KVM_DEV_TYPE_ARM_VGIC_V3:
-               ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops,
-                                             KVM_DEV_TYPE_ARM_VGIC_V3);
-
-               if (ret)
-                       break;
-               ret = kvm_vgic_register_its_device();
-               break;
-       }
-
-       return ret;
-}
-
-int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
-                      struct vgic_reg_attr *reg_attr)
-{
-       int cpuid;
-
-       cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >>
-                KVM_DEV_ARM_VGIC_CPUID_SHIFT;
-
-       if (cpuid >= atomic_read(&dev->kvm->online_vcpus))
-               return -EINVAL;
-
-       reg_attr->vcpu = kvm_get_vcpu(dev->kvm, cpuid);
-       reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
-
-       return 0;
-}
-
-/* unlocks vcpus from @vcpu_lock_idx and smaller */
-static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx)
-{
-       struct kvm_vcpu *tmp_vcpu;
-
-       for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
-               tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
-               mutex_unlock(&tmp_vcpu->mutex);
-       }
-}
-
-void unlock_all_vcpus(struct kvm *kvm)
-{
-       unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1);
-}
-
-/* Returns true if all vcpus were locked, false otherwise */
-bool lock_all_vcpus(struct kvm *kvm)
-{
-       struct kvm_vcpu *tmp_vcpu;
-       int c;
-
-       /*
-        * Any time a vcpu is run, vcpu_load is called which tries to grab the
-        * vcpu->mutex.  By grabbing the vcpu->mutex of all VCPUs we ensure
-        * that no other VCPUs are run and fiddle with the vgic state while we
-        * access it.
-        */
-       kvm_for_each_vcpu(c, tmp_vcpu, kvm) {
-               if (!mutex_trylock(&tmp_vcpu->mutex)) {
-                       unlock_vcpus(kvm, c - 1);
-                       return false;
-               }
-       }
-
-       return true;
-}
-
-/**
- * vgic_v2_attr_regs_access - allows user space to access VGIC v2 state
- *
- * @dev:      kvm device handle
- * @attr:     kvm device attribute
- * @reg:      address of the value to be read or written
- * @is_write: true if userspace is writing a register
- */
-static int vgic_v2_attr_regs_access(struct kvm_device *dev,
-                                   struct kvm_device_attr *attr,
-                                   u32 *reg, bool is_write)
-{
-       struct vgic_reg_attr reg_attr;
-       gpa_t addr;
-       struct kvm_vcpu *vcpu;
-       int ret;
-
-       ret = vgic_v2_parse_attr(dev, attr, &reg_attr);
-       if (ret)
-               return ret;
-
-       vcpu = reg_attr.vcpu;
-       addr = reg_attr.addr;
-
-       mutex_lock(&dev->kvm->lock);
-
-       ret = vgic_init(dev->kvm);
-       if (ret)
-               goto out;
-
-       if (!lock_all_vcpus(dev->kvm)) {
-               ret = -EBUSY;
-               goto out;
-       }
-
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
-               ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, reg);
-               break;
-       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-               ret = vgic_v2_dist_uaccess(vcpu, is_write, addr, reg);
-               break;
-       default:
-               ret = -EINVAL;
-               break;
-       }
-
-       unlock_all_vcpus(dev->kvm);
-out:
-       mutex_unlock(&dev->kvm->lock);
-       return ret;
-}
-
-static int vgic_v2_set_attr(struct kvm_device *dev,
-                           struct kvm_device_attr *attr)
-{
-       int ret;
-
-       ret = vgic_set_common_attr(dev, attr);
-       if (ret != -ENXIO)
-               return ret;
-
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
-               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-               u32 reg;
-
-               if (get_user(reg, uaddr))
-                       return -EFAULT;
-
-               return vgic_v2_attr_regs_access(dev, attr, &reg, true);
-       }
-       }
-
-       return -ENXIO;
-}
-
-static int vgic_v2_get_attr(struct kvm_device *dev,
-                           struct kvm_device_attr *attr)
-{
-       int ret;
-
-       ret = vgic_get_common_attr(dev, attr);
-       if (ret != -ENXIO)
-               return ret;
-
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
-               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-               u32 reg = 0;
-
-               ret = vgic_v2_attr_regs_access(dev, attr, &reg, false);
-               if (ret)
-                       return ret;
-               return put_user(reg, uaddr);
-       }
-       }
-
-       return -ENXIO;
-}
-
-static int vgic_v2_has_attr(struct kvm_device *dev,
-                           struct kvm_device_attr *attr)
-{
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_ADDR:
-               switch (attr->attr) {
-               case KVM_VGIC_V2_ADDR_TYPE_DIST:
-               case KVM_VGIC_V2_ADDR_TYPE_CPU:
-                       return 0;
-               }
-               break;
-       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
-               return vgic_v2_has_attr_regs(dev, attr);
-       case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
-               return 0;
-       case KVM_DEV_ARM_VGIC_GRP_CTRL:
-               switch (attr->attr) {
-               case KVM_DEV_ARM_VGIC_CTRL_INIT:
-                       return 0;
-               }
-       }
-       return -ENXIO;
-}
-
-struct kvm_device_ops kvm_arm_vgic_v2_ops = {
-       .name = "kvm-arm-vgic-v2",
-       .create = vgic_create,
-       .destroy = vgic_destroy,
-       .set_attr = vgic_v2_set_attr,
-       .get_attr = vgic_v2_get_attr,
-       .has_attr = vgic_v2_has_attr,
-};
-
-int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
-                      struct vgic_reg_attr *reg_attr)
-{
-       unsigned long vgic_mpidr, mpidr_reg;
-
-       /*
-        * For KVM_DEV_ARM_VGIC_GRP_DIST_REGS group,
-        * attr might not hold MPIDR. Hence assume vcpu0.
-        */
-       if (attr->group != KVM_DEV_ARM_VGIC_GRP_DIST_REGS) {
-               vgic_mpidr = (attr->attr & KVM_DEV_ARM_VGIC_V3_MPIDR_MASK) >>
-                             KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT;
-
-               mpidr_reg = VGIC_TO_MPIDR(vgic_mpidr);
-               reg_attr->vcpu = kvm_mpidr_to_vcpu(dev->kvm, mpidr_reg);
-       } else {
-               reg_attr->vcpu = kvm_get_vcpu(dev->kvm, 0);
-       }
-
-       if (!reg_attr->vcpu)
-               return -EINVAL;
-
-       reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
-
-       return 0;
-}
-
-/**
- * vgic_v3_attr_regs_access - allows user space to access VGIC v3 state
- *
- * @dev:      kvm device handle
- * @attr:     kvm device attribute
- * @reg:      address of the value to be read or written
- * @is_write: true if userspace is writing a register
- */
-static int vgic_v3_attr_regs_access(struct kvm_device *dev,
-                                   struct kvm_device_attr *attr,
-                                   u64 *reg, bool is_write)
-{
-       struct vgic_reg_attr reg_attr;
-       gpa_t addr;
-       struct kvm_vcpu *vcpu;
-       int ret;
-       u32 tmp32;
-
-       ret = vgic_v3_parse_attr(dev, attr, &reg_attr);
-       if (ret)
-               return ret;
-
-       vcpu = reg_attr.vcpu;
-       addr = reg_attr.addr;
-
-       mutex_lock(&dev->kvm->lock);
-
-       if (unlikely(!vgic_initialized(dev->kvm))) {
-               ret = -EBUSY;
-               goto out;
-       }
-
-       if (!lock_all_vcpus(dev->kvm)) {
-               ret = -EBUSY;
-               goto out;
-       }
-
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-               if (is_write)
-                       tmp32 = *reg;
-
-               ret = vgic_v3_dist_uaccess(vcpu, is_write, addr, &tmp32);
-               if (!is_write)
-                       *reg = tmp32;
-               break;
-       case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
-               if (is_write)
-                       tmp32 = *reg;
-
-               ret = vgic_v3_redist_uaccess(vcpu, is_write, addr, &tmp32);
-               if (!is_write)
-                       *reg = tmp32;
-               break;
-       case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
-               u64 regid;
-
-               regid = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK);
-               ret = vgic_v3_cpu_sysregs_uaccess(vcpu, is_write,
-                                                 regid, reg);
-               break;
-       }
-       case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
-               unsigned int info, intid;
-
-               info = (attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >>
-                       KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT;
-               if (info == VGIC_LEVEL_INFO_LINE_LEVEL) {
-                       intid = attr->attr &
-                               KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK;
-                       ret = vgic_v3_line_level_info_uaccess(vcpu, is_write,
-                                                             intid, reg);
-               } else {
-                       ret = -EINVAL;
-               }
-               break;
-       }
-       default:
-               ret = -EINVAL;
-               break;
-       }
-
-       unlock_all_vcpus(dev->kvm);
-out:
-       mutex_unlock(&dev->kvm->lock);
-       return ret;
-}
-
-static int vgic_v3_set_attr(struct kvm_device *dev,
-                           struct kvm_device_attr *attr)
-{
-       int ret;
-
-       ret = vgic_set_common_attr(dev, attr);
-       if (ret != -ENXIO)
-               return ret;
-
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-       case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: {
-               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-               u32 tmp32;
-               u64 reg;
-
-               if (get_user(tmp32, uaddr))
-                       return -EFAULT;
-
-               reg = tmp32;
-               return vgic_v3_attr_regs_access(dev, attr, &reg, true);
-       }
-       case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
-               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-               u64 reg;
-
-               if (get_user(reg, uaddr))
-                       return -EFAULT;
-
-               return vgic_v3_attr_regs_access(dev, attr, &reg, true);
-       }
-       case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
-               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-               u64 reg;
-               u32 tmp32;
-
-               if (get_user(tmp32, uaddr))
-                       return -EFAULT;
-
-               reg = tmp32;
-               return vgic_v3_attr_regs_access(dev, attr, &reg, true);
-       }
-       case KVM_DEV_ARM_VGIC_GRP_CTRL: {
-               int ret;
-
-               switch (attr->attr) {
-               case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES:
-                       mutex_lock(&dev->kvm->lock);
-
-                       if (!lock_all_vcpus(dev->kvm)) {
-                               mutex_unlock(&dev->kvm->lock);
-                               return -EBUSY;
-                       }
-                       ret = vgic_v3_save_pending_tables(dev->kvm);
-                       unlock_all_vcpus(dev->kvm);
-                       mutex_unlock(&dev->kvm->lock);
-                       return ret;
-               }
-               break;
-       }
-       }
-       return -ENXIO;
-}
-
-static int vgic_v3_get_attr(struct kvm_device *dev,
-                           struct kvm_device_attr *attr)
-{
-       int ret;
-
-       ret = vgic_get_common_attr(dev, attr);
-       if (ret != -ENXIO)
-               return ret;
-
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-       case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: {
-               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-               u64 reg;
-               u32 tmp32;
-
-               ret = vgic_v3_attr_regs_access(dev, attr, &reg, false);
-               if (ret)
-                       return ret;
-               tmp32 = reg;
-               return put_user(tmp32, uaddr);
-       }
-       case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
-               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-               u64 reg;
-
-               ret = vgic_v3_attr_regs_access(dev, attr, &reg, false);
-               if (ret)
-                       return ret;
-               return put_user(reg, uaddr);
-       }
-       case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
-               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-               u64 reg;
-               u32 tmp32;
-
-               ret = vgic_v3_attr_regs_access(dev, attr, &reg, false);
-               if (ret)
-                       return ret;
-               tmp32 = reg;
-               return put_user(tmp32, uaddr);
-       }
-       }
-       return -ENXIO;
-}
-
-static int vgic_v3_has_attr(struct kvm_device *dev,
-                           struct kvm_device_attr *attr)
-{
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_ADDR:
-               switch (attr->attr) {
-               case KVM_VGIC_V3_ADDR_TYPE_DIST:
-               case KVM_VGIC_V3_ADDR_TYPE_REDIST:
-               case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION:
-                       return 0;
-               }
-               break;
-       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-       case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
-       case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
-               return vgic_v3_has_attr_regs(dev, attr);
-       case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
-               return 0;
-       case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
-               if (((attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >>
-                     KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) ==
-                     VGIC_LEVEL_INFO_LINE_LEVEL)
-                       return 0;
-               break;
-       }
-       case KVM_DEV_ARM_VGIC_GRP_CTRL:
-               switch (attr->attr) {
-               case KVM_DEV_ARM_VGIC_CTRL_INIT:
-                       return 0;
-               case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES:
-                       return 0;
-               }
-       }
-       return -ENXIO;
-}
-
-struct kvm_device_ops kvm_arm_vgic_v3_ops = {
-       .name = "kvm-arm-vgic-v3",
-       .create = vgic_create,
-       .destroy = vgic_destroy,
-       .set_attr = vgic_v3_set_attr,
-       .get_attr = vgic_v3_get_attr,
-       .has_attr = vgic_v3_has_attr,
-};
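
For context on how the attribute handlers above are reached, here is a minimal userspace sketch driving them through the standard KVM device ioctls: create the VGIC v3 device, program the distributor base via KVM_DEV_ARM_VGIC_GRP_ADDR, then issue KVM_DEV_ARM_VGIC_CTRL_INIT. It assumes vm_fd is an already-created VM with its vCPUs instantiated, uses a placeholder distributor address, and omits the redistributor setup and error reporting; it illustrates the control flow, not a complete VGIC bring-up.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int vgic_v3_setup(int vm_fd)
{
	struct kvm_create_device cd = { .type = KVM_DEV_TYPE_ARM_VGIC_V3 };
	uint64_t dist_base = 0x08000000;		/* placeholder GPA */
	struct kvm_device_attr dist_attr = {
		.group = KVM_DEV_ARM_VGIC_GRP_ADDR,
		.attr  = KVM_VGIC_V3_ADDR_TYPE_DIST,
		.addr  = (uint64_t)(uintptr_t)&dist_base,
	};
	struct kvm_device_attr init_attr = {
		.group = KVM_DEV_ARM_VGIC_GRP_CTRL,
		.attr  = KVM_DEV_ARM_VGIC_CTRL_INIT,
	};

	/* Instantiates the device via vgic_create() -> kvm_vgic_create(). */
	if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0)
		return -1;
	/* Reaches vgic_set_common_attr() -> kvm_vgic_addr(..., write). */
	if (ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &dist_attr) < 0)
		return -1;
	/* Reaches vgic_set_common_attr() -> vgic_init(). */
	return ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &init_attr);
}

Each KVM_SET_DEVICE_ATTR call lands in vgic_v3_set_attr() above, which tries vgic_set_common_attr() first and only falls back to the group-specific register accessors when that returns -ENXIO.
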
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c
deleted file mode 100644 (file)
index a016f07..0000000
+++ /dev/null
@@ -1,550 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * VGICv2 MMIO handling functions
- */
-
-#include <linux/irqchip/arm-gic.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/nospec.h>
-
-#include <kvm/iodev.h>
-#include <kvm/arm_vgic.h>
-
-#include "vgic.h"
-#include "vgic-mmio.h"
-
-/*
- * The Revision field in the IIDR has the following meanings:
- *
- * Revision 1: Report GICv2 interrupts as group 0 instead of group 1
- * Revision 2: Interrupt groups are guest-configurable and signaled using
- *            their configured groups.
- */
-
-static unsigned long vgic_mmio_read_v2_misc(struct kvm_vcpu *vcpu,
-                                           gpa_t addr, unsigned int len)
-{
-       struct vgic_dist *vgic = &vcpu->kvm->arch.vgic;
-       u32 value;
-
-       switch (addr & 0x0c) {
-       case GIC_DIST_CTRL:
-               value = vgic->enabled ? GICD_ENABLE : 0;
-               break;
-       case GIC_DIST_CTR:
-               value = vgic->nr_spis + VGIC_NR_PRIVATE_IRQS;
-               value = (value >> 5) - 1;
-               value |= (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5;
-               break;
-       case GIC_DIST_IIDR:
-               value = (PRODUCT_ID_KVM << GICD_IIDR_PRODUCT_ID_SHIFT) |
-                       (vgic->implementation_rev << GICD_IIDR_REVISION_SHIFT) |
-                       (IMPLEMENTER_ARM << GICD_IIDR_IMPLEMENTER_SHIFT);
-               break;
-       default:
-               return 0;
-       }
-
-       return value;
-}
-
-static void vgic_mmio_write_v2_misc(struct kvm_vcpu *vcpu,
-                                   gpa_t addr, unsigned int len,
-                                   unsigned long val)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       bool was_enabled = dist->enabled;
-
-       switch (addr & 0x0c) {
-       case GIC_DIST_CTRL:
-               dist->enabled = val & GICD_ENABLE;
-               if (!was_enabled && dist->enabled)
-                       vgic_kick_vcpus(vcpu->kvm);
-               break;
-       case GIC_DIST_CTR:
-       case GIC_DIST_IIDR:
-               /* Nothing to do */
-               return;
-       }
-}
-
-static int vgic_mmio_uaccess_write_v2_misc(struct kvm_vcpu *vcpu,
-                                          gpa_t addr, unsigned int len,
-                                          unsigned long val)
-{
-       switch (addr & 0x0c) {
-       case GIC_DIST_IIDR:
-               if (val != vgic_mmio_read_v2_misc(vcpu, addr, len))
-                       return -EINVAL;
-
-               /*
-                * If we observe a write to GICD_IIDR we know that userspace
-                * has been updated and has had a chance to cope with older
-                * kernels (VGICv2 IIDR.Revision == 0) incorrectly reporting
-                * interrupts as group 1, and therefore we now allow groups to
-                * be user writable.  Doing this by default would break
-                * migration from old kernels to new kernels with legacy
-                * userspace.
-                */
-               vcpu->kvm->arch.vgic.v2_groups_user_writable = true;
-               return 0;
-       }
-
-       vgic_mmio_write_v2_misc(vcpu, addr, len, val);
-       return 0;
-}
-
-static int vgic_mmio_uaccess_write_v2_group(struct kvm_vcpu *vcpu,
-                                           gpa_t addr, unsigned int len,
-                                           unsigned long val)
-{
-       if (vcpu->kvm->arch.vgic.v2_groups_user_writable)
-               vgic_mmio_write_group(vcpu, addr, len, val);
-
-       return 0;
-}
-
-static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu,
-                                gpa_t addr, unsigned int len,
-                                unsigned long val)
-{
-       int nr_vcpus = atomic_read(&source_vcpu->kvm->online_vcpus);
-       int intid = val & 0xf;
-       int targets = (val >> 16) & 0xff;
-       int mode = (val >> 24) & 0x03;
-       int c;
-       struct kvm_vcpu *vcpu;
-       unsigned long flags;
-
-       switch (mode) {
-       case 0x0:               /* as specified by targets */
-               break;
-       case 0x1:
-               targets = (1U << nr_vcpus) - 1;                 /* all, ... */
-               targets &= ~(1U << source_vcpu->vcpu_id);       /* but self */
-               break;
-       case 0x2:               /* this very vCPU only */
-               targets = (1U << source_vcpu->vcpu_id);
-               break;
-       case 0x3:               /* reserved */
-               return;
-       }
-
-       kvm_for_each_vcpu(c, vcpu, source_vcpu->kvm) {
-               struct vgic_irq *irq;
-
-               if (!(targets & (1U << c)))
-                       continue;
-
-               irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid);
-
-               raw_spin_lock_irqsave(&irq->irq_lock, flags);
-               irq->pending_latch = true;
-               irq->source |= 1U << source_vcpu->vcpu_id;
-
-               vgic_queue_irq_unlock(source_vcpu->kvm, irq, flags);
-               vgic_put_irq(source_vcpu->kvm, irq);
-       }
-}
-
-static unsigned long vgic_mmio_read_target(struct kvm_vcpu *vcpu,
-                                          gpa_t addr, unsigned int len)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
-       int i;
-       u64 val = 0;
-
-       for (i = 0; i < len; i++) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-               val |= (u64)irq->targets << (i * 8);
-
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-
-       return val;
-}
-
-static void vgic_mmio_write_target(struct kvm_vcpu *vcpu,
-                                  gpa_t addr, unsigned int len,
-                                  unsigned long val)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
-       u8 cpu_mask = GENMASK(atomic_read(&vcpu->kvm->online_vcpus) - 1, 0);
-       int i;
-       unsigned long flags;
-
-       /* GICD_ITARGETSR[0-7] are read-only */
-       if (intid < VGIC_NR_PRIVATE_IRQS)
-               return;
-
-       for (i = 0; i < len; i++) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid + i);
-               int target;
-
-               raw_spin_lock_irqsave(&irq->irq_lock, flags);
-
-               irq->targets = (val >> (i * 8)) & cpu_mask;
-               target = irq->targets ? __ffs(irq->targets) : 0;
-               irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target);
-
-               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-}
-
-static unsigned long vgic_mmio_read_sgipend(struct kvm_vcpu *vcpu,
-                                           gpa_t addr, unsigned int len)
-{
-       u32 intid = addr & 0x0f;
-       int i;
-       u64 val = 0;
-
-       for (i = 0; i < len; i++) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-               val |= (u64)irq->source << (i * 8);
-
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-       return val;
-}
-
-static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu,
-                                    gpa_t addr, unsigned int len,
-                                    unsigned long val)
-{
-       u32 intid = addr & 0x0f;
-       int i;
-       unsigned long flags;
-
-       for (i = 0; i < len; i++) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-               raw_spin_lock_irqsave(&irq->irq_lock, flags);
-
-               irq->source &= ~((val >> (i * 8)) & 0xff);
-               if (!irq->source)
-                       irq->pending_latch = false;
-
-               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-}
-
-static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu,
-                                    gpa_t addr, unsigned int len,
-                                    unsigned long val)
-{
-       u32 intid = addr & 0x0f;
-       int i;
-       unsigned long flags;
-
-       for (i = 0; i < len; i++) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-               raw_spin_lock_irqsave(&irq->irq_lock, flags);
-
-               irq->source |= (val >> (i * 8)) & 0xff;
-
-               if (irq->source) {
-                       irq->pending_latch = true;
-                       vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-               } else {
-                       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-               }
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-}
-
-#define GICC_ARCH_VERSION_V2   0x2
-
-/* These are for userland accesses only; there is no guest-facing emulation. */
-static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu *vcpu,
-                                          gpa_t addr, unsigned int len)
-{
-       struct vgic_vmcr vmcr;
-       u32 val;
-
-       vgic_get_vmcr(vcpu, &vmcr);
-
-       switch (addr & 0xff) {
-       case GIC_CPU_CTRL:
-               val = vmcr.grpen0 << GIC_CPU_CTRL_EnableGrp0_SHIFT;
-               val |= vmcr.grpen1 << GIC_CPU_CTRL_EnableGrp1_SHIFT;
-               val |= vmcr.ackctl << GIC_CPU_CTRL_AckCtl_SHIFT;
-               val |= vmcr.fiqen << GIC_CPU_CTRL_FIQEn_SHIFT;
-               val |= vmcr.cbpr << GIC_CPU_CTRL_CBPR_SHIFT;
-               val |= vmcr.eoim << GIC_CPU_CTRL_EOImodeNS_SHIFT;
-
-               break;
-       case GIC_CPU_PRIMASK:
-               /*
-                * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the
-                * PMR field as GICH_VMCR.VMPriMask rather than
-                * GICC_PMR.Priority, so we expose the upper five bits of
-                * priority mask to userspace using the lower bits in the
-                * unsigned long.
-                */
-               val = (vmcr.pmr & GICV_PMR_PRIORITY_MASK) >>
-                       GICV_PMR_PRIORITY_SHIFT;
-               break;
-       case GIC_CPU_BINPOINT:
-               val = vmcr.bpr;
-               break;
-       case GIC_CPU_ALIAS_BINPOINT:
-               val = vmcr.abpr;
-               break;
-       case GIC_CPU_IDENT:
-               val = ((PRODUCT_ID_KVM << 20) |
-                      (GICC_ARCH_VERSION_V2 << 16) |
-                      IMPLEMENTER_ARM);
-               break;
-       default:
-               return 0;
-       }
-
-       return val;
-}
-
-static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu,
-                                  gpa_t addr, unsigned int len,
-                                  unsigned long val)
-{
-       struct vgic_vmcr vmcr;
-
-       vgic_get_vmcr(vcpu, &vmcr);
-
-       switch (addr & 0xff) {
-       case GIC_CPU_CTRL:
-               vmcr.grpen0 = !!(val & GIC_CPU_CTRL_EnableGrp0);
-               vmcr.grpen1 = !!(val & GIC_CPU_CTRL_EnableGrp1);
-               vmcr.ackctl = !!(val & GIC_CPU_CTRL_AckCtl);
-               vmcr.fiqen = !!(val & GIC_CPU_CTRL_FIQEn);
-               vmcr.cbpr = !!(val & GIC_CPU_CTRL_CBPR);
-               vmcr.eoim = !!(val & GIC_CPU_CTRL_EOImodeNS);
-
-               break;
-       case GIC_CPU_PRIMASK:
-               /*
-                * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the
-                * PMR field as GICH_VMCR.VMPriMask rather than
-                * GICC_PMR.Priority, so we expose the upper five bits of
-                * priority mask to userspace using the lower bits in the
-                * unsigned long.
-                */
-               vmcr.pmr = (val << GICV_PMR_PRIORITY_SHIFT) &
-                       GICV_PMR_PRIORITY_MASK;
-               break;
-       case GIC_CPU_BINPOINT:
-               vmcr.bpr = val;
-               break;
-       case GIC_CPU_ALIAS_BINPOINT:
-               vmcr.abpr = val;
-               break;
-       }
-
-       vgic_set_vmcr(vcpu, &vmcr);
-}
-
-static unsigned long vgic_mmio_read_apr(struct kvm_vcpu *vcpu,
-                                       gpa_t addr, unsigned int len)
-{
-       int n; /* which APRn is this */
-
-       n = (addr >> 2) & 0x3;
-
-       if (kvm_vgic_global_state.type == VGIC_V2) {
-               /* GICv2 hardware systems support max. 32 groups */
-               if (n != 0)
-                       return 0;
-               return vcpu->arch.vgic_cpu.vgic_v2.vgic_apr;
-       } else {
-               struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3;
-
-               if (n > vgic_v3_max_apr_idx(vcpu))
-                       return 0;
-
-               n = array_index_nospec(n, 4);
-
-               /* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */
-               return vgicv3->vgic_ap1r[n];
-       }
-}
-
-static void vgic_mmio_write_apr(struct kvm_vcpu *vcpu,
-                               gpa_t addr, unsigned int len,
-                               unsigned long val)
-{
-       int n; /* which APRn is this */
-
-       n = (addr >> 2) & 0x3;
-
-       if (kvm_vgic_global_state.type == VGIC_V2) {
-               /* GICv2 hardware systems support max. 32 groups */
-               if (n != 0)
-                       return;
-               vcpu->arch.vgic_cpu.vgic_v2.vgic_apr = val;
-       } else {
-               struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3;
-
-               if (n > vgic_v3_max_apr_idx(vcpu))
-                       return;
-
-               n = array_index_nospec(n, 4);
-
-               /* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */
-               vgicv3->vgic_ap1r[n] = val;
-       }
-}
-
-static const struct vgic_register_region vgic_v2_dist_registers[] = {
-       REGISTER_DESC_WITH_LENGTH_UACCESS(GIC_DIST_CTRL,
-               vgic_mmio_read_v2_misc, vgic_mmio_write_v2_misc,
-               NULL, vgic_mmio_uaccess_write_v2_misc,
-               12, VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_IGROUP,
-               vgic_mmio_read_group, vgic_mmio_write_group,
-               NULL, vgic_mmio_uaccess_write_v2_group, 1,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_SET,
-               vgic_mmio_read_enable, vgic_mmio_write_senable,
-               NULL, vgic_uaccess_write_senable, 1,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_CLEAR,
-               vgic_mmio_read_enable, vgic_mmio_write_cenable,
-               NULL, vgic_uaccess_write_cenable, 1,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET,
-               vgic_mmio_read_pending, vgic_mmio_write_spending,
-               NULL, vgic_uaccess_write_spending, 1,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_CLEAR,
-               vgic_mmio_read_pending, vgic_mmio_write_cpending,
-               NULL, vgic_uaccess_write_cpending, 1,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET,
-               vgic_mmio_read_active, vgic_mmio_write_sactive,
-               vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 1,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_CLEAR,
-               vgic_mmio_read_active, vgic_mmio_write_cactive,
-               vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 1,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PRI,
-               vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL,
-               8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
-       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_TARGET,
-               vgic_mmio_read_target, vgic_mmio_write_target, NULL, NULL, 8,
-               VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
-       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_CONFIG,
-               vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_LENGTH(GIC_DIST_SOFTINT,
-               vgic_mmio_read_raz, vgic_mmio_write_sgir, 4,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_CLEAR,
-               vgic_mmio_read_sgipend, vgic_mmio_write_sgipendc, 16,
-               VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
-       REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_SET,
-               vgic_mmio_read_sgipend, vgic_mmio_write_sgipends, 16,
-               VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
-};
-
-static const struct vgic_register_region vgic_v2_cpu_registers[] = {
-       REGISTER_DESC_WITH_LENGTH(GIC_CPU_CTRL,
-               vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_LENGTH(GIC_CPU_PRIMASK,
-               vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_LENGTH(GIC_CPU_BINPOINT,
-               vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_LENGTH(GIC_CPU_ALIAS_BINPOINT,
-               vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_LENGTH(GIC_CPU_ACTIVEPRIO,
-               vgic_mmio_read_apr, vgic_mmio_write_apr, 16,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_LENGTH(GIC_CPU_IDENT,
-               vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
-               VGIC_ACCESS_32bit),
-};
-
-unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev)
-{
-       dev->regions = vgic_v2_dist_registers;
-       dev->nr_regions = ARRAY_SIZE(vgic_v2_dist_registers);
-
-       kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops);
-
-       return SZ_4K;
-}
-
-int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr)
-{
-       const struct vgic_register_region *region;
-       struct vgic_io_device iodev;
-       struct vgic_reg_attr reg_attr;
-       struct kvm_vcpu *vcpu;
-       gpa_t addr;
-       int ret;
-
-       ret = vgic_v2_parse_attr(dev, attr, &reg_attr);
-       if (ret)
-               return ret;
-
-       vcpu = reg_attr.vcpu;
-       addr = reg_attr.addr;
-
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-               iodev.regions = vgic_v2_dist_registers;
-               iodev.nr_regions = ARRAY_SIZE(vgic_v2_dist_registers);
-               iodev.base_addr = 0;
-               break;
-       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
-               iodev.regions = vgic_v2_cpu_registers;
-               iodev.nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers);
-               iodev.base_addr = 0;
-               break;
-       default:
-               return -ENXIO;
-       }
-
-       /* We only support aligned 32-bit accesses. */
-       if (addr & 3)
-               return -ENXIO;
-
-       region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32));
-       if (!region)
-               return -ENXIO;
-
-       return 0;
-}
-
-int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-                         int offset, u32 *val)
-{
-       struct vgic_io_device dev = {
-               .regions = vgic_v2_cpu_registers,
-               .nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers),
-               .iodev_type = IODEV_CPUIF,
-       };
-
-       return vgic_uaccess(vcpu, &dev, is_write, offset, val);
-}
-
-int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-                        int offset, u32 *val)
-{
-       struct vgic_io_device dev = {
-               .regions = vgic_v2_dist_registers,
-               .nr_regions = ARRAY_SIZE(vgic_v2_dist_registers),
-               .iodev_type = IODEV_DIST,
-       };
-
-       return vgic_uaccess(vcpu, &dev, is_write, offset, val);
-}
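
A brief note on the GICD_SGIR decoding performed by vgic_mmio_write_sgir() further up: bits [3:0] carry the SGI number, bits [23:16] the CPU target list and bits [25:24] the target list filter. The standalone sketch below shows just that decoding; the struct and helper names are invented for illustration, and nr_vcpus is assumed to be at most 8, as permitted by GICv2.

#include <stdint.h>

struct sgir_fields {
	unsigned int intid;	/* SGI number, 0..15 */
	unsigned int targets;	/* bitmap of target vCPUs */
	unsigned int mode;	/* 0: target list, 1: all-but-self, 2: self */
};

static struct sgir_fields decode_sgir(uint32_t val, unsigned int nr_vcpus,
				      unsigned int self)
{
	struct sgir_fields f = {
		.intid   = val & 0xf,
		.targets = (val >> 16) & 0xff,
		.mode    = (val >> 24) & 0x3,
	};

	switch (f.mode) {
	case 0x1:	/* all vCPUs except the writer */
		f.targets = ((1U << nr_vcpus) - 1) & ~(1U << self);
		break;
	case 0x2:	/* only the writing vCPU */
		f.targets = 1U << self;
		break;
	case 0x3:	/* reserved: no SGI is forwarded */
		f.targets = 0;
		break;
	}
	return f;
}

In the emulation above, modes 0x1 and 0x2 simply rewrite the target list before the per-vCPU loop sets pending_latch and queues the SGI on each targeted vCPU.
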
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
deleted file mode 100644 (file)
index 89a14ec..0000000
+++ /dev/null
@@ -1,1063 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * VGICv3 MMIO handling functions
- */
-
-#include <linux/bitfield.h>
-#include <linux/irqchip/arm-gic-v3.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <kvm/iodev.h>
-#include <kvm/arm_vgic.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-
-#include "vgic.h"
-#include "vgic-mmio.h"
-
-/* extract @num bytes at byte offset @offset in @data */
-unsigned long extract_bytes(u64 data, unsigned int offset,
-                           unsigned int num)
-{
-       return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0);
-}
-
-/* allows updates of any half of a 64-bit register (or the whole thing) */
-u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
-                    unsigned long val)
-{
-       int lower = (offset & 4) * 8;
-       int upper = lower + 8 * len - 1;
-
-       reg &= ~GENMASK_ULL(upper, lower);
-       val &= GENMASK_ULL(len * 8 - 1, 0);
-
-       return reg | ((u64)val << lower);
-}
-
-bool vgic_has_its(struct kvm *kvm)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-
-       if (dist->vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3)
-               return false;
-
-       return dist->has_its;
-}
-
-bool vgic_supports_direct_msis(struct kvm *kvm)
-{
-       return (kvm_vgic_global_state.has_gicv4_1 ||
-               (kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm)));
-}
-
-/*
- * The Revision field in the IIDR has the following meanings:
- *
- * Revision 2: Interrupt groups are guest-configurable and signaled using
- *            their configured groups.
- */
-
-static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
-                                           gpa_t addr, unsigned int len)
-{
-       struct vgic_dist *vgic = &vcpu->kvm->arch.vgic;
-       u32 value = 0;
-
-       switch (addr & 0x0c) {
-       case GICD_CTLR:
-               if (vgic->enabled)
-                       value |= GICD_CTLR_ENABLE_SS_G1;
-               value |= GICD_CTLR_ARE_NS | GICD_CTLR_DS;
-               if (vgic->nassgireq)
-                       value |= GICD_CTLR_nASSGIreq;
-               break;
-       case GICD_TYPER:
-               value = vgic->nr_spis + VGIC_NR_PRIVATE_IRQS;
-               value = (value >> 5) - 1;
-               if (vgic_has_its(vcpu->kvm)) {
-                       value |= (INTERRUPT_ID_BITS_ITS - 1) << 19;
-                       value |= GICD_TYPER_LPIS;
-               } else {
-                       value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19;
-               }
-               break;
-       case GICD_TYPER2:
-               if (kvm_vgic_global_state.has_gicv4_1)
-                       value = GICD_TYPER2_nASSGIcap;
-               break;
-       case GICD_IIDR:
-               value = (PRODUCT_ID_KVM << GICD_IIDR_PRODUCT_ID_SHIFT) |
-                       (vgic->implementation_rev << GICD_IIDR_REVISION_SHIFT) |
-                       (IMPLEMENTER_ARM << GICD_IIDR_IMPLEMENTER_SHIFT);
-               break;
-       default:
-               return 0;
-       }
-
-       return value;
-}
-
-static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu,
-                                   gpa_t addr, unsigned int len,
-                                   unsigned long val)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       switch (addr & 0x0c) {
-       case GICD_CTLR: {
-               bool was_enabled, is_hwsgi;
-
-               mutex_lock(&vcpu->kvm->lock);
-
-               was_enabled = dist->enabled;
-               is_hwsgi = dist->nassgireq;
-
-               dist->enabled = val & GICD_CTLR_ENABLE_SS_G1;
-
-               /* Not a GICv4.1? No HW SGIs */
-               if (!kvm_vgic_global_state.has_gicv4_1)
-                       val &= ~GICD_CTLR_nASSGIreq;
-
-               /* Dist stays enabled? nASSGIreq is RO */
-               if (was_enabled && dist->enabled) {
-                       val &= ~GICD_CTLR_nASSGIreq;
-                       val |= FIELD_PREP(GICD_CTLR_nASSGIreq, is_hwsgi);
-               }
-
-               /* Switching HW SGIs? */
-               dist->nassgireq = val & GICD_CTLR_nASSGIreq;
-               if (is_hwsgi != dist->nassgireq)
-                       vgic_v4_configure_vsgis(vcpu->kvm);
-
-               if (kvm_vgic_global_state.has_gicv4_1 &&
-                   was_enabled != dist->enabled)
-                       kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_RELOAD_GICv4);
-               else if (!was_enabled && dist->enabled)
-                       vgic_kick_vcpus(vcpu->kvm);
-
-               mutex_unlock(&vcpu->kvm->lock);
-               break;
-       }
-       case GICD_TYPER:
-       case GICD_TYPER2:
-       case GICD_IIDR:
-               /* This is at best for documentation purposes... */
-               return;
-       }
-}
-
-static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu,
-                                          gpa_t addr, unsigned int len,
-                                          unsigned long val)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       switch (addr & 0x0c) {
-       case GICD_TYPER2:
-       case GICD_IIDR:
-               if (val != vgic_mmio_read_v3_misc(vcpu, addr, len))
-                       return -EINVAL;
-               return 0;
-       case GICD_CTLR:
-               /* Not a GICv4.1? No HW SGIs */
-               if (!kvm_vgic_global_state.has_gicv4_1)
-                       val &= ~GICD_CTLR_nASSGIreq;
-
-               dist->enabled = val & GICD_CTLR_ENABLE_SS_G1;
-               dist->nassgireq = val & GICD_CTLR_nASSGIreq;
-               return 0;
-       }
-
-       vgic_mmio_write_v3_misc(vcpu, addr, len, val);
-       return 0;
-}
-
-static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu,
-                                           gpa_t addr, unsigned int len)
-{
-       int intid = VGIC_ADDR_TO_INTID(addr, 64);
-       struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid);
-       unsigned long ret = 0;
-
-       if (!irq)
-               return 0;
-
-       /* The upper word is RAZ for us. */
-       if (!(addr & 4))
-               ret = extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len);
-
-       vgic_put_irq(vcpu->kvm, irq);
-       return ret;
-}
-
-static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
-                                   gpa_t addr, unsigned int len,
-                                   unsigned long val)
-{
-       int intid = VGIC_ADDR_TO_INTID(addr, 64);
-       struct vgic_irq *irq;
-       unsigned long flags;
-
-       /* The upper word is WI for us since we don't implement Aff3. */
-       if (addr & 4)
-               return;
-
-       irq = vgic_get_irq(vcpu->kvm, NULL, intid);
-
-       if (!irq)
-               return;
-
-       raw_spin_lock_irqsave(&irq->irq_lock, flags);
-
-       /* We only care about and preserve Aff0, Aff1 and Aff2. */
-       irq->mpidr = val & GENMASK(23, 0);
-       irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr);
-
-       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-       vgic_put_irq(vcpu->kvm, irq);
-}
-
-static unsigned long vgic_mmio_read_v3r_ctlr(struct kvm_vcpu *vcpu,
-                                            gpa_t addr, unsigned int len)
-{
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
-       return vgic_cpu->lpis_enabled ? GICR_CTLR_ENABLE_LPIS : 0;
-}
-
-
-static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu,
-                                    gpa_t addr, unsigned int len,
-                                    unsigned long val)
-{
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-       bool was_enabled = vgic_cpu->lpis_enabled;
-
-       if (!vgic_has_its(vcpu->kvm))
-               return;
-
-       vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS;
-
-       if (was_enabled && !vgic_cpu->lpis_enabled) {
-               vgic_flush_pending_lpis(vcpu);
-               vgic_its_invalidate_cache(vcpu->kvm);
-       }
-
-       if (!was_enabled && vgic_cpu->lpis_enabled)
-               vgic_enable_lpis(vcpu);
-}
-
-static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
-                                             gpa_t addr, unsigned int len)
-{
-       unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-       struct vgic_redist_region *rdreg = vgic_cpu->rdreg;
-       int target_vcpu_id = vcpu->vcpu_id;
-       gpa_t last_rdist_typer = rdreg->base + GICR_TYPER +
-                       (rdreg->free_index - 1) * KVM_VGIC_V3_REDIST_SIZE;
-       u64 value;
-
-       value = (u64)(mpidr & GENMASK(23, 0)) << 32;
-       value |= ((target_vcpu_id & 0xffff) << 8);
-
-       if (addr == last_rdist_typer)
-               value |= GICR_TYPER_LAST;
-       if (vgic_has_its(vcpu->kvm))
-               value |= GICR_TYPER_PLPIS;
-
-       return extract_bytes(value, addr & 7, len);
-}
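extract_bytes() is defined elsewhere in the vgic code and not shown in this
hunk. Judging from its callers here, it returns num bytes of a 64-bit value
starting at byte offset; a rough, assumption-laden sketch:

/* Hedged sketch of the byte-extraction helper used by the 64-bit handlers. */
#include <stdio.h>
#include <stdint.h>

static uint64_t extract_bytes_sketch(uint64_t data, unsigned int offset,
				     unsigned int num)
{
	/* Keep only the requested bytes, shifted down to bit 0. */
	return (data >> (offset * 8)) & (~0ULL >> ((8 - num) * 8));
}

int main(void)
{
	/* A 4-byte read at offset 4 of a GICR_TYPER-like value returns Aff. */
	uint64_t typer = 0x0000000100001000ULL;	/* made-up example value */
	printf("0x%llx\n", (unsigned long long)extract_bytes_sketch(typer, 4, 4));
	return 0;
}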
-
-static unsigned long vgic_mmio_read_v3r_iidr(struct kvm_vcpu *vcpu,
-                                            gpa_t addr, unsigned int len)
-{
-       return (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
-}
-
-static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu,
-                                             gpa_t addr, unsigned int len)
-{
-       switch (addr & 0xffff) {
-       case GICD_PIDR2:
-               /* report a GICv3 compliant implementation */
-               return 0x3b;
-       }
-
-       return 0;
-}
-
-static unsigned long vgic_v3_uaccess_read_pending(struct kvm_vcpu *vcpu,
-                                                 gpa_t addr, unsigned int len)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-       u32 value = 0;
-       int i;
-
-       /*
-        * The pending state of an interrupt is latched in the pending_latch
-        * variable. Userspace will save and restore the pending state and
-        * line_level separately.
-        * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.txt
-        * for handling of ISPENDR and ICPENDR.
-        */
-       for (i = 0; i < len * 8; i++) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-               bool state = irq->pending_latch;
-
-               if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
-                       int err;
-
-                       err = irq_get_irqchip_state(irq->host_irq,
-                                                   IRQCHIP_STATE_PENDING,
-                                                   &state);
-                       WARN_ON(err);
-               }
-
-               if (state)
-                       value |= (1U << i);
-
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-
-       return value;
-}
-
-static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu,
-                                        gpa_t addr, unsigned int len,
-                                        unsigned long val)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-       int i;
-       unsigned long flags;
-
-       for (i = 0; i < len * 8; i++) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-               raw_spin_lock_irqsave(&irq->irq_lock, flags);
-               if (test_bit(i, &val)) {
-                       /*
-                        * pending_latch is set irrespective of the irq type
-                        * (level or edge) to avoid a dependency on the VM
-                        * restoring the irq config before the pending info.
-                        */
-                       irq->pending_latch = true;
-                       vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-               } else {
-                       irq->pending_latch = false;
-                       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-               }
-
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-
-       return 0;
-}
-
-/* We want to avoid outer shareable. */
-u64 vgic_sanitise_shareability(u64 field)
-{
-       switch (field) {
-       case GIC_BASER_OuterShareable:
-               return GIC_BASER_InnerShareable;
-       default:
-               return field;
-       }
-}
-
-/* Avoid any inner non-cacheable mapping. */
-u64 vgic_sanitise_inner_cacheability(u64 field)
-{
-       switch (field) {
-       case GIC_BASER_CACHE_nCnB:
-       case GIC_BASER_CACHE_nC:
-               return GIC_BASER_CACHE_RaWb;
-       default:
-               return field;
-       }
-}
-
-/* Non-cacheable or same-as-inner are OK. */
-u64 vgic_sanitise_outer_cacheability(u64 field)
-{
-       switch (field) {
-       case GIC_BASER_CACHE_SameAsInner:
-       case GIC_BASER_CACHE_nC:
-               return field;
-       default:
-               return GIC_BASER_CACHE_nC;
-       }
-}
-
-u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
-                       u64 (*sanitise_fn)(u64))
-{
-       u64 field = (reg & field_mask) >> field_shift;
-
-       field = sanitise_fn(field) << field_shift;
-       return (reg & ~field_mask) | field;
-}
-
-#define PROPBASER_RES0_MASK                                            \
-       (GENMASK_ULL(63, 59) | GENMASK_ULL(55, 52) | GENMASK_ULL(6, 5))
-#define PENDBASER_RES0_MASK                                            \
-       (BIT_ULL(63) | GENMASK_ULL(61, 59) | GENMASK_ULL(55, 52) |      \
-        GENMASK_ULL(15, 12) | GENMASK_ULL(6, 0))
-
-static u64 vgic_sanitise_pendbaser(u64 reg)
-{
-       reg = vgic_sanitise_field(reg, GICR_PENDBASER_SHAREABILITY_MASK,
-                                 GICR_PENDBASER_SHAREABILITY_SHIFT,
-                                 vgic_sanitise_shareability);
-       reg = vgic_sanitise_field(reg, GICR_PENDBASER_INNER_CACHEABILITY_MASK,
-                                 GICR_PENDBASER_INNER_CACHEABILITY_SHIFT,
-                                 vgic_sanitise_inner_cacheability);
-       reg = vgic_sanitise_field(reg, GICR_PENDBASER_OUTER_CACHEABILITY_MASK,
-                                 GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT,
-                                 vgic_sanitise_outer_cacheability);
-
-       reg &= ~PENDBASER_RES0_MASK;
-
-       return reg;
-}
-
-static u64 vgic_sanitise_propbaser(u64 reg)
-{
-       reg = vgic_sanitise_field(reg, GICR_PROPBASER_SHAREABILITY_MASK,
-                                 GICR_PROPBASER_SHAREABILITY_SHIFT,
-                                 vgic_sanitise_shareability);
-       reg = vgic_sanitise_field(reg, GICR_PROPBASER_INNER_CACHEABILITY_MASK,
-                                 GICR_PROPBASER_INNER_CACHEABILITY_SHIFT,
-                                 vgic_sanitise_inner_cacheability);
-       reg = vgic_sanitise_field(reg, GICR_PROPBASER_OUTER_CACHEABILITY_MASK,
-                                 GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT,
-                                 vgic_sanitise_outer_cacheability);
-
-       reg &= ~PROPBASER_RES0_MASK;
-       return reg;
-}
-
-static unsigned long vgic_mmio_read_propbase(struct kvm_vcpu *vcpu,
-                                            gpa_t addr, unsigned int len)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       return extract_bytes(dist->propbaser, addr & 7, len);
-}
-
-static void vgic_mmio_write_propbase(struct kvm_vcpu *vcpu,
-                                    gpa_t addr, unsigned int len,
-                                    unsigned long val)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-       u64 old_propbaser, propbaser;
-
-       /* Storing a value with LPIs already enabled is undefined */
-       if (vgic_cpu->lpis_enabled)
-               return;
-
-       do {
-               old_propbaser = READ_ONCE(dist->propbaser);
-               propbaser = old_propbaser;
-               propbaser = update_64bit_reg(propbaser, addr & 4, len, val);
-               propbaser = vgic_sanitise_propbaser(propbaser);
-       } while (cmpxchg64(&dist->propbaser, old_propbaser,
-                          propbaser) != old_propbaser);
-}
-
-static unsigned long vgic_mmio_read_pendbase(struct kvm_vcpu *vcpu,
-                                            gpa_t addr, unsigned int len)
-{
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-       u64 value = vgic_cpu->pendbaser;
-
-       value &= ~GICR_PENDBASER_PTZ;
-
-       return extract_bytes(value, addr & 7, len);
-}
-
-static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu,
-                                    gpa_t addr, unsigned int len,
-                                    unsigned long val)
-{
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-       u64 old_pendbaser, pendbaser;
-
-       /* Storing a value with LPIs already enabled is undefined */
-       if (vgic_cpu->lpis_enabled)
-               return;
-
-       do {
-               old_pendbaser = READ_ONCE(vgic_cpu->pendbaser);
-               pendbaser = old_pendbaser;
-               pendbaser = update_64bit_reg(pendbaser, addr & 4, len, val);
-               pendbaser = vgic_sanitise_pendbaser(pendbaser);
-       } while (cmpxchg64(&vgic_cpu->pendbaser, old_pendbaser,
-                          pendbaser) != old_pendbaser);
-}
-
-/*
- * The GICv3 per-IRQ registers are split to control PPIs and SGIs in the
- * redistributors, while SPIs are covered by registers in the distributor
- * block. Trying to set private IRQs in this block gets ignored.
- * We take some special care here to fix the calculation of the register
- * offset.
- */
-#define REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(off, rd, wr, ur, uw, bpi, acc) \
-       {                                                               \
-               .reg_offset = off,                                      \
-               .bits_per_irq = bpi,                                    \
-               .len = (bpi * VGIC_NR_PRIVATE_IRQS) / 8,                \
-               .access_flags = acc,                                    \
-               .read = vgic_mmio_read_raz,                             \
-               .write = vgic_mmio_write_wi,                            \
-       }, {                                                            \
-               .reg_offset = off + (bpi * VGIC_NR_PRIVATE_IRQS) / 8,   \
-               .bits_per_irq = bpi,                                    \
-               .len = (bpi * (1024 - VGIC_NR_PRIVATE_IRQS)) / 8,       \
-               .access_flags = acc,                                    \
-               .read = rd,                                             \
-               .write = wr,                                            \
-               .uaccess_read = ur,                                     \
-               .uaccess_write = uw,                                    \
-       }
-
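The byte ranges the macro above carves out can be sanity-checked with a small
stand-alone sketch. The GICD offsets below are the architectural ones and are
assumptions of this example, not taken from this hunk:

/* Illustrative only: recomputes the private (RAZ/WI) and shared ranges. */
#include <stdio.h>

#define VGIC_NR_PRIVATE_IRQS	32	/* SGIs + PPIs, as the macro assumes */

static void show_split(const char *name, unsigned int off, unsigned int bpi)
{
	unsigned int priv_len   = bpi * VGIC_NR_PRIVATE_IRQS / 8;
	unsigned int shared_len = bpi * (1024 - VGIC_NR_PRIVATE_IRQS) / 8;

	printf("%s: RAZ/WI 0x%03x-0x%03x, SPIs 0x%03x-0x%03x\n",
	       name, off, off + priv_len - 1,
	       off + priv_len, off + priv_len + shared_len - 1);
}

int main(void)
{
	show_split("GICD_ISENABLER", 0x100, 1);		/* 1 bit per INTID  */
	show_split("GICD_IPRIORITYR", 0x400, 8);	/* 8 bits per INTID */
	return 0;
}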
-static const struct vgic_register_region vgic_v3_dist_registers[] = {
-       REGISTER_DESC_WITH_LENGTH_UACCESS(GICD_CTLR,
-               vgic_mmio_read_v3_misc, vgic_mmio_write_v3_misc,
-               NULL, vgic_mmio_uaccess_write_v3_misc,
-               16, VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_LENGTH(GICD_STATUSR,
-               vgic_mmio_read_rao, vgic_mmio_write_wi, 4,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGROUPR,
-               vgic_mmio_read_group, vgic_mmio_write_group, NULL, NULL, 1,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISENABLER,
-               vgic_mmio_read_enable, vgic_mmio_write_senable,
-               NULL, vgic_uaccess_write_senable, 1,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICENABLER,
-               vgic_mmio_read_enable, vgic_mmio_write_cenable,
-               NULL, vgic_uaccess_write_cenable, 1,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR,
-               vgic_mmio_read_pending, vgic_mmio_write_spending,
-               vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR,
-               vgic_mmio_read_pending, vgic_mmio_write_cpending,
-               vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 1,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER,
-               vgic_mmio_read_active, vgic_mmio_write_sactive,
-               vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 1,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICACTIVER,
-               vgic_mmio_read_active, vgic_mmio_write_cactive,
-               vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive,
-               1, VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IPRIORITYR,
-               vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL,
-               8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
-       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ITARGETSR,
-               vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 8,
-               VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
-       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICFGR,
-               vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGRPMODR,
-               vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 1,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IROUTER,
-               vgic_mmio_read_irouter, vgic_mmio_write_irouter, NULL, NULL, 64,
-               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_LENGTH(GICD_IDREGS,
-               vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
-               VGIC_ACCESS_32bit),
-};
-
-static const struct vgic_register_region vgic_v3_rd_registers[] = {
-       /* RD_base registers */
-       REGISTER_DESC_WITH_LENGTH(GICR_CTLR,
-               vgic_mmio_read_v3r_ctlr, vgic_mmio_write_v3r_ctlr, 4,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_LENGTH(GICR_STATUSR,
-               vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_LENGTH(GICR_IIDR,
-               vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_LENGTH(GICR_TYPER,
-               vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 8,
-               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_LENGTH(GICR_WAKER,
-               vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER,
-               vgic_mmio_read_propbase, vgic_mmio_write_propbase, 8,
-               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER,
-               vgic_mmio_read_pendbase, vgic_mmio_write_pendbase, 8,
-               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_LENGTH(GICR_IDREGS,
-               vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
-               VGIC_ACCESS_32bit),
-       /* SGI_base registers */
-       REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IGROUPR0,
-               vgic_mmio_read_group, vgic_mmio_write_group, 4,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISENABLER0,
-               vgic_mmio_read_enable, vgic_mmio_write_senable,
-               NULL, vgic_uaccess_write_senable, 4,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICENABLER0,
-               vgic_mmio_read_enable, vgic_mmio_write_cenable,
-               NULL, vgic_uaccess_write_cenable, 4,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISPENDR0,
-               vgic_mmio_read_pending, vgic_mmio_write_spending,
-               vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICPENDR0,
-               vgic_mmio_read_pending, vgic_mmio_write_cpending,
-               vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 4,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISACTIVER0,
-               vgic_mmio_read_active, vgic_mmio_write_sactive,
-               vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 4,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICACTIVER0,
-               vgic_mmio_read_active, vgic_mmio_write_cactive,
-               vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 4,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IPRIORITYR0,
-               vgic_mmio_read_priority, vgic_mmio_write_priority, 32,
-               VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
-       REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_ICFGR0,
-               vgic_mmio_read_config, vgic_mmio_write_config, 8,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IGRPMODR0,
-               vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
-               VGIC_ACCESS_32bit),
-       REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_NSACR,
-               vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
-               VGIC_ACCESS_32bit),
-};
-
-unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev)
-{
-       dev->regions = vgic_v3_dist_registers;
-       dev->nr_regions = ARRAY_SIZE(vgic_v3_dist_registers);
-
-       kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops);
-
-       return SZ_64K;
-}
-
-/**
- * vgic_register_redist_iodev - register a single redist iodev
- * @vcpu:    The VCPU to which the redistributor belongs
- *
- * Register a KVM iodev for this VCPU's redistributor using the address
- * provided.
- *
- * Return 0 on success, -ERRNO otherwise.
- */
-int vgic_register_redist_iodev(struct kvm_vcpu *vcpu)
-{
-       struct kvm *kvm = vcpu->kvm;
-       struct vgic_dist *vgic = &kvm->arch.vgic;
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-       struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev;
-       struct vgic_redist_region *rdreg;
-       gpa_t rd_base;
-       int ret;
-
-       if (!IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr))
-               return 0;
-
-       /*
-        * We may be creating VCPUs before having set the base address for the
-        * redistributor region, in which case we will come back to this
-        * function for all VCPUs when the base address is set.  Just return
-        * without doing any work for now.
-        */
-       rdreg = vgic_v3_rdist_free_slot(&vgic->rd_regions);
-       if (!rdreg)
-               return 0;
-
-       if (!vgic_v3_check_base(kvm))
-               return -EINVAL;
-
-       vgic_cpu->rdreg = rdreg;
-
-       rd_base = rdreg->base + rdreg->free_index * KVM_VGIC_V3_REDIST_SIZE;
-
-       kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops);
-       rd_dev->base_addr = rd_base;
-       rd_dev->iodev_type = IODEV_REDIST;
-       rd_dev->regions = vgic_v3_rd_registers;
-       rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rd_registers);
-       rd_dev->redist_vcpu = vcpu;
-
-       mutex_lock(&kvm->slots_lock);
-       ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, rd_base,
-                                     2 * SZ_64K, &rd_dev->dev);
-       mutex_unlock(&kvm->slots_lock);
-
-       if (ret)
-               return ret;
-
-       rdreg->free_index++;
-       return 0;
-}
-
-static void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu)
-{
-       struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev;
-
-       kvm_io_bus_unregister_dev(vcpu->kvm, KVM_MMIO_BUS, &rd_dev->dev);
-}
-
-static int vgic_register_all_redist_iodevs(struct kvm *kvm)
-{
-       struct kvm_vcpu *vcpu;
-       int c, ret = 0;
-
-       kvm_for_each_vcpu(c, vcpu, kvm) {
-               ret = vgic_register_redist_iodev(vcpu);
-               if (ret)
-                       break;
-       }
-
-       if (ret) {
-               /* The current c failed, so unwind from the previous one. */
-               mutex_lock(&kvm->slots_lock);
-               for (c--; c >= 0; c--) {
-                       vcpu = kvm_get_vcpu(kvm, c);
-                       vgic_unregister_redist_iodev(vcpu);
-               }
-               mutex_unlock(&kvm->slots_lock);
-       }
-
-       return ret;
-}
-
-/**
- * vgic_v3_insert_redist_region - Insert a new redistributor region
- *
- * Performs various checks before inserting the rdist region in the list.
- * Those tests depend on whether the size of the rdist region is known
- * (i.e. count != 0). The list is sorted by rdist region index.
- *
- * @kvm: kvm handle
- * @index: redist region index
- * @base: base of the new rdist region
- * @count: number of redistributors the region is made of (0 for the legacy
- * single region, whose size is deduced from the number of vcpus)
- *
- * Return 0 on success, < 0 otherwise
- */
-static int vgic_v3_insert_redist_region(struct kvm *kvm, uint32_t index,
-                                       gpa_t base, uint32_t count)
-{
-       struct vgic_dist *d = &kvm->arch.vgic;
-       struct vgic_redist_region *rdreg;
-       struct list_head *rd_regions = &d->rd_regions;
-       size_t size = count * KVM_VGIC_V3_REDIST_SIZE;
-       int ret;
-
-       /* single rdist region already set? */
-       if (!count && !list_empty(rd_regions))
-               return -EINVAL;
-
-       /* cross the end of memory? */
-       if (base + size < base)
-               return -EINVAL;
-
-       if (list_empty(rd_regions)) {
-               if (index != 0)
-                       return -EINVAL;
-       } else {
-               rdreg = list_last_entry(rd_regions,
-                                       struct vgic_redist_region, list);
-               if (index != rdreg->index + 1)
-                       return -EINVAL;
-
-               /* Cannot add an explicitly sized region after the legacy region */
-               if (!rdreg->count)
-                       return -EINVAL;
-       }
-
-       /*
-        * For legacy single-region redistributor regions (!count),
-        * check that the redistributor region does not overlap with the
-        * distributor's address space.
-        */
-       if (!count && !IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) &&
-               vgic_dist_overlap(kvm, base, size))
-               return -EINVAL;
-
-       /* collision with any other rdist region? */
-       if (vgic_v3_rdist_overlap(kvm, base, size))
-               return -EINVAL;
-
-       rdreg = kzalloc(sizeof(*rdreg), GFP_KERNEL);
-       if (!rdreg)
-               return -ENOMEM;
-
-       rdreg->base = VGIC_ADDR_UNDEF;
-
-       ret = vgic_check_ioaddr(kvm, &rdreg->base, base, SZ_64K);
-       if (ret)
-               goto free;
-
-       rdreg->base = base;
-       rdreg->count = count;
-       rdreg->free_index = 0;
-       rdreg->index = index;
-
-       list_add_tail(&rdreg->list, rd_regions);
-       return 0;
-free:
-       kfree(rdreg);
-       return ret;
-}
-
-int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count)
-{
-       int ret;
-
-       ret = vgic_v3_insert_redist_region(kvm, index, addr, count);
-       if (ret)
-               return ret;
-
-       /*
-        * Register iodevs for each existing VCPU.  Adding more VCPUs
-        * afterwards will register the iodevs when needed.
-        */
-       ret = vgic_register_all_redist_iodevs(kvm);
-       if (ret)
-               return ret;
-
-       return 0;
-}
-
-int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr)
-{
-       const struct vgic_register_region *region;
-       struct vgic_io_device iodev;
-       struct vgic_reg_attr reg_attr;
-       struct kvm_vcpu *vcpu;
-       gpa_t addr;
-       int ret;
-
-       ret = vgic_v3_parse_attr(dev, attr, &reg_attr);
-       if (ret)
-               return ret;
-
-       vcpu = reg_attr.vcpu;
-       addr = reg_attr.addr;
-
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-               iodev.regions = vgic_v3_dist_registers;
-               iodev.nr_regions = ARRAY_SIZE(vgic_v3_dist_registers);
-               iodev.base_addr = 0;
-               break;
-       case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:{
-               iodev.regions = vgic_v3_rd_registers;
-               iodev.nr_regions = ARRAY_SIZE(vgic_v3_rd_registers);
-               iodev.base_addr = 0;
-               break;
-       }
-       case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
-               u64 reg, id;
-
-               id = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK);
-               return vgic_v3_has_cpu_sysregs_attr(vcpu, 0, id, &reg);
-       }
-       default:
-               return -ENXIO;
-       }
-
-       /* We only support aligned 32-bit accesses. */
-       if (addr & 3)
-               return -ENXIO;
-
-       region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32));
-       if (!region)
-               return -ENXIO;
-
-       return 0;
-}
-
-/*
- * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI
- * generation register ICC_SGI1R_EL1) with a given VCPU.
- * If the VCPU's MPIDR matches, return the level0 affinity, otherwise
- * return -1.
- */
-static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu)
-{
-       unsigned long affinity;
-       int level0;
-
-       /*
-        * Split the current VCPU's MPIDR into affinity level 0 and the
-        * rest as this is what we have to compare against.
-        */
-       affinity = kvm_vcpu_get_mpidr_aff(vcpu);
-       level0 = MPIDR_AFFINITY_LEVEL(affinity, 0);
-       affinity &= ~MPIDR_LEVEL_MASK;
-
-       /* bail out if the upper three levels don't match */
-       if (sgi_aff != affinity)
-               return -1;
-
-       /* Is this VCPU's bit set in the mask? */
-       if (!(sgi_cpu_mask & BIT(level0)))
-               return -1;
-
-       return level0;
-}
-
-/*
- * The ICC_SGI* registers encode the affinity differently from the MPIDR,
- * so provide a wrapper to use the existing defines to isolate a certain
- * affinity level.
- */
-#define SGI_AFFINITY_LEVEL(reg, level) \
-       ((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \
-       >> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))
-
-/**
- * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs
- * @vcpu: The VCPU requesting an SGI
- * @reg: The value written into ICC_{ASGI1,SGI0,SGI1}R by that VCPU
- * @allow_group1: Does the sysreg access allow generation of G1 SGIs
- *
- * With GICv3 (and ARE=1) CPUs trigger SGIs by writing to a system register.
- * This will trap in sys_regs.c and call this function.
- * The register value contains the upper three affinity levels of the
- * target processors as well as a bitmask of 16 Aff0 CPUs.
- * If the interrupt routing mode bit is not set, we iterate over all VCPUs to
- * check for matching ones. If this bit is set, we signal all VCPUs except
- * the calling one.
- */
-void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1)
-{
-       struct kvm *kvm = vcpu->kvm;
-       struct kvm_vcpu *c_vcpu;
-       u16 target_cpus;
-       u64 mpidr;
-       int sgi, c;
-       int vcpu_id = vcpu->vcpu_id;
-       bool broadcast;
-       unsigned long flags;
-
-       sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT;
-       broadcast = reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT);
-       target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT;
-       mpidr = SGI_AFFINITY_LEVEL(reg, 3);
-       mpidr |= SGI_AFFINITY_LEVEL(reg, 2);
-       mpidr |= SGI_AFFINITY_LEVEL(reg, 1);
-
-       /*
-        * We iterate over all VCPUs to find the MPIDRs matching the request.
-        * Once we have handled a CPU, we clear its bit so that we can detect
-        * early whether we are already finished. This avoids iterating through
-        * all VCPUs when, most of the time, we just signal a single VCPU.
-        */
-       kvm_for_each_vcpu(c, c_vcpu, kvm) {
-               struct vgic_irq *irq;
-
-               /* Exit early if we have dealt with all requested CPUs */
-               if (!broadcast && target_cpus == 0)
-                       break;
-
-               /* Don't signal the calling VCPU */
-               if (broadcast && c == vcpu_id)
-                       continue;
-
-               if (!broadcast) {
-                       int level0;
-
-                       level0 = match_mpidr(mpidr, target_cpus, c_vcpu);
-                       if (level0 == -1)
-                               continue;
-
-                       /* remove this matching VCPU from the mask */
-                       target_cpus &= ~BIT(level0);
-               }
-
-               irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi);
-
-               raw_spin_lock_irqsave(&irq->irq_lock, flags);
-
-               /*
-                * An access targeting Group0 SGIs can only generate
-                * those, while an access targeting Group1 SGIs can
-                * generate interrupts of either group.
-                */
-               if (!irq->group || allow_group1) {
-                       if (!irq->hw) {
-                               irq->pending_latch = true;
-                               vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-                       } else {
-                               /* HW SGI? Ask the GIC to inject it */
-                               int err;
-                               err = irq_set_irqchip_state(irq->host_irq,
-                                                           IRQCHIP_STATE_PENDING,
-                                                           true);
-                               WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
-                               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-                       }
-               } else {
-                       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-               }
-
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-}
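For reference, a sample ICC_SGI1R_EL1 value can be decoded with the same field
positions used above (shift values per the GICv3 architecture; this is a
stand-alone sketch, not kernel code):

/* Illustrative decode of an ICC_SGI1R_EL1 write, mirroring the extraction
 * in vgic_v3_dispatch_sgi() above. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t reg = (2ULL << 24) |	/* SGI ID 2 */
		       (1ULL << 16) |	/* Aff1 = 1 */
		       0x0005ULL;	/* TargetList: CPUs 0 and 2 */

	unsigned int sgi     = (reg >> 24) & 0xf;
	unsigned int aff1    = (reg >> 16) & 0xff;
	unsigned int targets = reg & 0xffff;
	int broadcast        = !!(reg & (1ULL << 40));	/* IRM bit */

	printf("SGI %u, Aff1=%u, target mask 0x%04x, broadcast=%d\n",
	       sgi, aff1, targets, broadcast);
	return 0;
}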
-
-int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-                        int offset, u32 *val)
-{
-       struct vgic_io_device dev = {
-               .regions = vgic_v3_dist_registers,
-               .nr_regions = ARRAY_SIZE(vgic_v3_dist_registers),
-       };
-
-       return vgic_uaccess(vcpu, &dev, is_write, offset, val);
-}
-
-int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-                          int offset, u32 *val)
-{
-       struct vgic_io_device rd_dev = {
-               .regions = vgic_v3_rd_registers,
-               .nr_regions = ARRAY_SIZE(vgic_v3_rd_registers),
-       };
-
-       return vgic_uaccess(vcpu, &rd_dev, is_write, offset, val);
-}
-
-int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-                                   u32 intid, u64 *val)
-{
-       if (intid % 32)
-               return -EINVAL;
-
-       if (is_write)
-               vgic_write_irq_line_level_info(vcpu, intid, *val);
-       else
-               *val = vgic_read_irq_line_level_info(vcpu, intid);
-
-       return 0;
-}
diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
deleted file mode 100644 (file)
index b2d73fc..0000000
+++ /dev/null
@@ -1,1088 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * VGIC MMIO handling functions
- */
-
-#include <linux/bitops.h>
-#include <linux/bsearch.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <kvm/iodev.h>
-#include <kvm/arm_arch_timer.h>
-#include <kvm/arm_vgic.h>
-
-#include "vgic.h"
-#include "vgic-mmio.h"
-
-unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu,
-                                gpa_t addr, unsigned int len)
-{
-       return 0;
-}
-
-unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu,
-                                gpa_t addr, unsigned int len)
-{
-       return -1UL;
-}
-
-void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
-                       unsigned int len, unsigned long val)
-{
-       /* Ignore */
-}
-
-int vgic_mmio_uaccess_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
-                              unsigned int len, unsigned long val)
-{
-       /* Ignore */
-       return 0;
-}
-
-unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu,
-                                  gpa_t addr, unsigned int len)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-       u32 value = 0;
-       int i;
-
-       /* Loop over all IRQs affected by this read */
-       for (i = 0; i < len * 8; i++) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-               if (irq->group)
-                       value |= BIT(i);
-
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-
-       return value;
-}
-
-static void vgic_update_vsgi(struct vgic_irq *irq)
-{
-       WARN_ON(its_prop_update_vsgi(irq->host_irq, irq->priority, irq->group));
-}
-
-void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr,
-                          unsigned int len, unsigned long val)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-       int i;
-       unsigned long flags;
-
-       for (i = 0; i < len * 8; i++) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-               raw_spin_lock_irqsave(&irq->irq_lock, flags);
-               irq->group = !!(val & BIT(i));
-               if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
-                       vgic_update_vsgi(irq);
-                       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-               } else {
-                       vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-               }
-
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-}
-
-/*
- * Read accesses to both GICD_ICENABLER and GICD_ISENABLER return the value
- * of the enabled bit, so there is only one function for both here.
- */
-unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu,
-                                   gpa_t addr, unsigned int len)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-       u32 value = 0;
-       int i;
-
-       /* Loop over all IRQs affected by this read */
-       for (i = 0; i < len * 8; i++) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-               if (irq->enabled)
-                       value |= (1U << i);
-
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-
-       return value;
-}
-
-void vgic_mmio_write_senable(struct kvm_vcpu *vcpu,
-                            gpa_t addr, unsigned int len,
-                            unsigned long val)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-       int i;
-       unsigned long flags;
-
-       for_each_set_bit(i, &val, len * 8) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-               raw_spin_lock_irqsave(&irq->irq_lock, flags);
-               if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
-                       if (!irq->enabled) {
-                               struct irq_data *data;
-
-                               irq->enabled = true;
-                               data = &irq_to_desc(irq->host_irq)->irq_data;
-                               while (irqd_irq_disabled(data))
-                                       enable_irq(irq->host_irq);
-                       }
-
-                       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-                       vgic_put_irq(vcpu->kvm, irq);
-
-                       continue;
-               } else if (vgic_irq_is_mapped_level(irq)) {
-                       bool was_high = irq->line_level;
-
-                       /*
-                        * We need to update the state of the interrupt because
-                        * the guest might have changed the state of the device
-                        * while the interrupt was disabled at the VGIC level.
-                        */
-                       irq->line_level = vgic_get_phys_line_level(irq);
-                       /*
-                        * Deactivate the physical interrupt so the GIC will let
-                        * us know when it is asserted again.
-                        */
-                       if (!irq->active && was_high && !irq->line_level)
-                               vgic_irq_set_phys_active(irq, false);
-               }
-               irq->enabled = true;
-               vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-}
-
-void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
-                            gpa_t addr, unsigned int len,
-                            unsigned long val)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-       int i;
-       unsigned long flags;
-
-       for_each_set_bit(i, &val, len * 8) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-               raw_spin_lock_irqsave(&irq->irq_lock, flags);
-               if (irq->hw && vgic_irq_is_sgi(irq->intid) && irq->enabled)
-                       disable_irq_nosync(irq->host_irq);
-
-               irq->enabled = false;
-
-               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-}
-
-int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu,
-                              gpa_t addr, unsigned int len,
-                              unsigned long val)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-       int i;
-       unsigned long flags;
-
-       for_each_set_bit(i, &val, len * 8) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-               raw_spin_lock_irqsave(&irq->irq_lock, flags);
-               irq->enabled = true;
-               vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-
-       return 0;
-}
-
-int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu,
-                              gpa_t addr, unsigned int len,
-                              unsigned long val)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-       int i;
-       unsigned long flags;
-
-       for_each_set_bit(i, &val, len * 8) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-               raw_spin_lock_irqsave(&irq->irq_lock, flags);
-               irq->enabled = false;
-               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-
-       return 0;
-}
-
-unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
-                                    gpa_t addr, unsigned int len)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-       u32 value = 0;
-       int i;
-
-       /* Loop over all IRQs affected by this read */
-       for (i = 0; i < len * 8; i++) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-               unsigned long flags;
-               bool val;
-
-               raw_spin_lock_irqsave(&irq->irq_lock, flags);
-               if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
-                       int err;
-
-                       val = false;
-                       err = irq_get_irqchip_state(irq->host_irq,
-                                                   IRQCHIP_STATE_PENDING,
-                                                   &val);
-                       WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
-               } else {
-                       val = irq_is_pending(irq);
-               }
-
-               value |= ((u32)val << i);
-               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-
-       return value;
-}
-
-static bool is_vgic_v2_sgi(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
-{
-       return (vgic_irq_is_sgi(irq->intid) &&
-               vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2);
-}
-
-void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
-                             gpa_t addr, unsigned int len,
-                             unsigned long val)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-       int i;
-       unsigned long flags;
-
-       for_each_set_bit(i, &val, len * 8) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-               /* GICD_ISPENDR0 SGI bits are WI */
-               if (is_vgic_v2_sgi(vcpu, irq)) {
-                       vgic_put_irq(vcpu->kvm, irq);
-                       continue;
-               }
-
-               raw_spin_lock_irqsave(&irq->irq_lock, flags);
-
-               if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
-                       /* HW SGI? Ask the GIC to inject it */
-                       int err;
-                       err = irq_set_irqchip_state(irq->host_irq,
-                                                   IRQCHIP_STATE_PENDING,
-                                                   true);
-                       WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
-
-                       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-                       vgic_put_irq(vcpu->kvm, irq);
-
-                       continue;
-               }
-
-               irq->pending_latch = true;
-               if (irq->hw)
-                       vgic_irq_set_phys_active(irq, true);
-
-               vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-}
-
-int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu,
-                               gpa_t addr, unsigned int len,
-                               unsigned long val)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-       int i;
-       unsigned long flags;
-
-       for_each_set_bit(i, &val, len * 8) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-               raw_spin_lock_irqsave(&irq->irq_lock, flags);
-               irq->pending_latch = true;
-
-               /*
-                * GICv2 SGIs are terribly broken. We can't restore
-                * the source of the interrupt, so just pick the vcpu
-                * itself as the source...
-                */
-               if (is_vgic_v2_sgi(vcpu, irq))
-                       irq->source |= BIT(vcpu->vcpu_id);
-
-               vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-
-       return 0;
-}
-
-/* Must be called with irq->irq_lock held */
-static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
-{
-       irq->pending_latch = false;
-
-       /*
-        * We don't want the guest to effectively mask the physical
-        * interrupt by doing a write to SPENDR followed by a write to
-        * CPENDR for HW interrupts, so we clear the active state on
-        * the physical side if the virtual interrupt is not active.
-        * This may lead to taking an additional interrupt on the
-        * host, but that should not be a problem as the worst that
-        * can happen is an additional vgic injection.  We also clear
-        * the pending state to maintain proper semantics for edge HW
-        * interrupts.
-        */
-       vgic_irq_set_phys_pending(irq, false);
-       if (!irq->active)
-               vgic_irq_set_phys_active(irq, false);
-}
-
-void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
-                             gpa_t addr, unsigned int len,
-                             unsigned long val)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-       int i;
-       unsigned long flags;
-
-       for_each_set_bit(i, &val, len * 8) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-               /* GICD_ICPENDR0 SGI bits are WI */
-               if (is_vgic_v2_sgi(vcpu, irq)) {
-                       vgic_put_irq(vcpu->kvm, irq);
-                       continue;
-               }
-
-               raw_spin_lock_irqsave(&irq->irq_lock, flags);
-
-               if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
-                       /* HW SGI? Ask the GIC to clear its pending bit */
-                       int err;
-                       err = irq_set_irqchip_state(irq->host_irq,
-                                                   IRQCHIP_STATE_PENDING,
-                                                   false);
-                       WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
-
-                       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-                       vgic_put_irq(vcpu->kvm, irq);
-
-                       continue;
-               }
-
-               if (irq->hw)
-                       vgic_hw_irq_cpending(vcpu, irq);
-               else
-                       irq->pending_latch = false;
-
-               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-}
-
-int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
-                               gpa_t addr, unsigned int len,
-                               unsigned long val)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-       int i;
-       unsigned long flags;
-
-       for_each_set_bit(i, &val, len * 8) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-               raw_spin_lock_irqsave(&irq->irq_lock, flags);
-               /*
-                * More fun with GICv2 SGIs! If we're clearing one of them
-                * from userspace, which source vcpu to clear? Let's not
-                * even think of it, and blow the whole set.
-                */
-               if (is_vgic_v2_sgi(vcpu, irq))
-                       irq->source = 0;
-
-               irq->pending_latch = false;
-
-               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-
-       return 0;
-}
-
-/*
- * If we are fiddling with an IRQ's active state, we have to make sure the IRQ
- * is not queued on some running VCPU's LRs, because then the change to the
- * active state can be overwritten when the VCPU's state is synced coming back
- * from the guest.
- *
- * For shared interrupts as well as GICv3 private interrupts, we have to
- * stop all the VCPUs because interrupts can be migrated while we don't hold
- * the IRQ locks and we don't want to be chasing moving targets.
- *
- * For GICv2 private interrupts we don't have to do anything because
- * userspace accesses to the VGIC state already require all VCPUs to be
- * stopped, and only the VCPU itself can modify its private interrupts
- * active state, which guarantees that the VCPU is not running.
- */
-static void vgic_access_active_prepare(struct kvm_vcpu *vcpu, u32 intid)
-{
-       if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
-           intid >= VGIC_NR_PRIVATE_IRQS)
-               kvm_arm_halt_guest(vcpu->kvm);
-}
-
-/* See vgic_access_active_prepare */
-static void vgic_access_active_finish(struct kvm_vcpu *vcpu, u32 intid)
-{
-       if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
-           intid >= VGIC_NR_PRIVATE_IRQS)
-               kvm_arm_resume_guest(vcpu->kvm);
-}
-
-static unsigned long __vgic_mmio_read_active(struct kvm_vcpu *vcpu,
-                                            gpa_t addr, unsigned int len)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-       u32 value = 0;
-       int i;
-
-       /* Loop over all IRQs affected by this read */
-       for (i = 0; i < len * 8; i++) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-               /*
-                * Even for HW interrupts, don't evaluate the HW state, as
-                * the guest is only interested in the virtual state.
-                */
-               if (irq->active)
-                       value |= (1U << i);
-
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-
-       return value;
-}
-
-unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
-                                   gpa_t addr, unsigned int len)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-       u32 val;
-
-       mutex_lock(&vcpu->kvm->lock);
-       vgic_access_active_prepare(vcpu, intid);
-
-       val = __vgic_mmio_read_active(vcpu, addr, len);
-
-       vgic_access_active_finish(vcpu, intid);
-       mutex_unlock(&vcpu->kvm->lock);
-
-       return val;
-}
-
-unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu,
-                                   gpa_t addr, unsigned int len)
-{
-       return __vgic_mmio_read_active(vcpu, addr, len);
-}
-
-/* Must be called with irq->irq_lock held */
-static void vgic_hw_irq_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
-                                     bool active, bool is_uaccess)
-{
-       if (is_uaccess)
-               return;
-
-       irq->active = active;
-       vgic_irq_set_phys_active(irq, active);
-}
-
-static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
-                                   bool active)
-{
-       unsigned long flags;
-       struct kvm_vcpu *requester_vcpu = kvm_get_running_vcpu();
-
-       raw_spin_lock_irqsave(&irq->irq_lock, flags);
-
-       if (irq->hw && !vgic_irq_is_sgi(irq->intid)) {
-               vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu);
-       } else if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
-               /*
-                * GICv4.1 VSGI feature doesn't track an active state,
-                * so let's not kid ourselves, there is nothing we can
-                * do here.
-                */
-               irq->active = false;
-       } else {
-               u32 model = vcpu->kvm->arch.vgic.vgic_model;
-               u8 active_source;
-
-               irq->active = active;
-
-               /*
-                * The GICv2 architecture indicates that the source CPUID for
-                * an SGI should be provided during an EOI which implies that
-                * the active state is stored somewhere, but at the same time
-                * this state is not architecturally exposed anywhere and we
-                * have no way of knowing the right source.
-                *
-                * This may lead to a VCPU not being able to receive
-                * additional instances of a particular SGI after migration
-                * for a GICv2 VM on some GIC implementations.  Oh well.
-                */
-               active_source = (requester_vcpu) ? requester_vcpu->vcpu_id : 0;
-
-               if (model == KVM_DEV_TYPE_ARM_VGIC_V2 &&
-                   active && vgic_irq_is_sgi(irq->intid))
-                       irq->active_source = active_source;
-       }
-
-       if (irq->active)
-               vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-       else
-               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-}
-
-static void __vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
-                                     gpa_t addr, unsigned int len,
-                                     unsigned long val)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-       int i;
-
-       for_each_set_bit(i, &val, len * 8) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-               vgic_mmio_change_active(vcpu, irq, false);
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-}
-
-void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
-                            gpa_t addr, unsigned int len,
-                            unsigned long val)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-
-       mutex_lock(&vcpu->kvm->lock);
-       vgic_access_active_prepare(vcpu, intid);
-
-       __vgic_mmio_write_cactive(vcpu, addr, len, val);
-
-       vgic_access_active_finish(vcpu, intid);
-       mutex_unlock(&vcpu->kvm->lock);
-}
-
-int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu,
-                                    gpa_t addr, unsigned int len,
-                                    unsigned long val)
-{
-       __vgic_mmio_write_cactive(vcpu, addr, len, val);
-       return 0;
-}
-
-static void __vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
-                                     gpa_t addr, unsigned int len,
-                                     unsigned long val)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-       int i;
-
-       for_each_set_bit(i, &val, len * 8) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-               vgic_mmio_change_active(vcpu, irq, true);
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-}
-
-void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
-                            gpa_t addr, unsigned int len,
-                            unsigned long val)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-
-       mutex_lock(&vcpu->kvm->lock);
-       vgic_access_active_prepare(vcpu, intid);
-
-       __vgic_mmio_write_sactive(vcpu, addr, len, val);
-
-       vgic_access_active_finish(vcpu, intid);
-       mutex_unlock(&vcpu->kvm->lock);
-}
-
-int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu,
-                                    gpa_t addr, unsigned int len,
-                                    unsigned long val)
-{
-       __vgic_mmio_write_sactive(vcpu, addr, len, val);
-       return 0;
-}
-
-unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
-                                     gpa_t addr, unsigned int len)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
-       int i;
-       u64 val = 0;
-
-       for (i = 0; i < len; i++) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-               val |= (u64)irq->priority << (i * 8);
-
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-
-       return val;
-}
-
-/*
- * We currently don't handle changing the priority of an interrupt that
- * is already pending on a VCPU. If there is a need for this, we would
- * need to make this VCPU exit and re-evaluate the priorities, potentially
- * leading to this interrupt getting presented now to the guest (if it has
- * been masked by the priority mask before).
- */
-void vgic_mmio_write_priority(struct kvm_vcpu *vcpu,
-                             gpa_t addr, unsigned int len,
-                             unsigned long val)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
-       int i;
-       unsigned long flags;
-
-       for (i = 0; i < len; i++) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-               raw_spin_lock_irqsave(&irq->irq_lock, flags);
-               /* Narrow the priority range to what we actually support */
-               irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS);
-               if (irq->hw && vgic_irq_is_sgi(irq->intid))
-                       vgic_update_vsgi(irq);
-               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-}
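/*
 * Standalone illustration (not kernel code): a minimal sketch of the
 * priority narrowing performed above, assuming the kernel's VGIC_PRI_BITS
 * value of 5. GENMASK() is redefined locally so the snippet builds on its
 * own in user space.
 */
#include <stdio.h>
#include <stdint.h>

#define GENMASK(h, l)   (((~0U) << (l)) & (~0U >> (31 - (h))))
#define VGIC_PRI_BITS   5

int main(void)
{
        uint8_t guest_val = 0xa7;       /* arbitrary byte written by the guest */
        uint8_t stored = guest_val & GENMASK(7, 8 - VGIC_PRI_BITS);

        /* Prints 0xa0: the low (8 - VGIC_PRI_BITS) bits read back as zero. */
        printf("guest wrote 0x%02x, VGIC stores 0x%02x\n", guest_val, stored);
        return 0;
}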
-
-unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu,
-                                   gpa_t addr, unsigned int len)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 2);
-       u32 value = 0;
-       int i;
-
-       for (i = 0; i < len * 4; i++) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-               if (irq->config == VGIC_CONFIG_EDGE)
-                       value |= (2U << (i * 2));
-
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-
-       return value;
-}
-
-void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
-                           gpa_t addr, unsigned int len,
-                           unsigned long val)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 2);
-       int i;
-       unsigned long flags;
-
-       for (i = 0; i < len * 4; i++) {
-               struct vgic_irq *irq;
-
-               /*
-                * The configuration cannot be changed for SGIs in general;
-                * for PPIs this is IMPLEMENTATION DEFINED. The arch timer
-                * code relies on PPIs being level triggered, so we also
-                * make them read-only here.
-                */
-               if (intid + i < VGIC_NR_PRIVATE_IRQS)
-                       continue;
-
-               irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-               raw_spin_lock_irqsave(&irq->irq_lock, flags);
-
-               if (test_bit(i * 2 + 1, &val))
-                       irq->config = VGIC_CONFIG_EDGE;
-               else
-                       irq->config = VGIC_CONFIG_LEVEL;
-
-               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-}
-
-u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid)
-{
-       int i;
-       u64 val = 0;
-       int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
-
-       for (i = 0; i < 32; i++) {
-               struct vgic_irq *irq;
-
-               if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs)
-                       continue;
-
-               irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-               if (irq->config == VGIC_CONFIG_LEVEL && irq->line_level)
-                       val |= (1U << i);
-
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-
-       return val;
-}
-
-void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid,
-                                   const u64 val)
-{
-       int i;
-       int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
-       unsigned long flags;
-
-       for (i = 0; i < 32; i++) {
-               struct vgic_irq *irq;
-               bool new_level;
-
-               if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs)
-                       continue;
-
-               irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-               /*
-                * Line level is set irrespective of irq type
-                * (level or edge) to avoid a dependency on the VM
-                * restoring the irq config before the line level.
-                */
-               new_level = !!(val & (1U << i));
-               raw_spin_lock_irqsave(&irq->irq_lock, flags);
-               irq->line_level = new_level;
-               if (new_level)
-                       vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-               else
-                       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-}
-
-static int match_region(const void *key, const void *elt)
-{
-       const unsigned int offset = (unsigned long)key;
-       const struct vgic_register_region *region = elt;
-
-       if (offset < region->reg_offset)
-               return -1;
-
-       if (offset >= region->reg_offset + region->len)
-               return 1;
-
-       return 0;
-}
-
-const struct vgic_register_region *
-vgic_find_mmio_region(const struct vgic_register_region *regions,
-                     int nr_regions, unsigned int offset)
-{
-       return bsearch((void *)(uintptr_t)offset, regions, nr_regions,
-                      sizeof(regions[0]), match_region);
-}
-
-void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
-{
-       if (kvm_vgic_global_state.type == VGIC_V2)
-               vgic_v2_set_vmcr(vcpu, vmcr);
-       else
-               vgic_v3_set_vmcr(vcpu, vmcr);
-}
-
-void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
-{
-       if (kvm_vgic_global_state.type == VGIC_V2)
-               vgic_v2_get_vmcr(vcpu, vmcr);
-       else
-               vgic_v3_get_vmcr(vcpu, vmcr);
-}
-
-/*
- * kvm_mmio_read_buf() returns a value in a format where it can be converted
- * to a byte array and be directly observed as the guest wanted it to appear
- * in memory if it had done the store itself, which is LE for the GIC, as the
- * guest knows the GIC is always LE.
- *
- * We convert this value to the CPU's native format to deal with it as a data
- * value.
- */
-unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len)
-{
-       unsigned long data = kvm_mmio_read_buf(val, len);
-
-       switch (len) {
-       case 1:
-               return data;
-       case 2:
-               return le16_to_cpu(data);
-       case 4:
-               return le32_to_cpu(data);
-       default:
-               return le64_to_cpu(data);
-       }
-}
-
-/*
- * kvm_mmio_write_buf() expects a value in a format such that if converted to
- * a byte array it is observed as the guest would see it if it could perform
- * the load directly.  Since the GIC is LE, and the guest knows this, the
- * guest expects a value in little endian format.
- *
- * We convert the data value from the CPU's native format to LE so that the
- * value is returned in the proper format.
- */
-void vgic_data_host_to_mmio_bus(void *buf, unsigned int len,
-                               unsigned long data)
-{
-       switch (len) {
-       case 1:
-               break;
-       case 2:
-               data = cpu_to_le16(data);
-               break;
-       case 4:
-               data = cpu_to_le32(data);
-               break;
-       default:
-               data = cpu_to_le64(data);
-       }
-
-       kvm_mmio_write_buf(buf, len, data);
-}
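/*
 * Standalone illustration (not kernel code): a user-space sketch of the
 * byte-order rule described in the two comments above. The explicit shifts
 * stand in for kvm_mmio_read_buf() plus le32_to_cpu(); the point is only
 * that the MMIO buffer is always assembled little-endian, whatever the host
 * byte order.
 */
#include <stdio.h>
#include <stdint.h>

static uint32_t mmio_bus_to_host32(const uint8_t *buf)
{
        /* Assemble the value LE, independent of the host's endianness. */
        return (uint32_t)buf[0] | ((uint32_t)buf[1] << 8) |
               ((uint32_t)buf[2] << 16) | ((uint32_t)buf[3] << 24);
}

int main(void)
{
        /* The bytes a guest store of 0x12345678 leaves in the MMIO buffer. */
        uint8_t buf[4] = { 0x78, 0x56, 0x34, 0x12 };

        printf("host data value: 0x%08x\n", mmio_bus_to_host32(buf));
        return 0;
}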
-
-static
-struct vgic_io_device *kvm_to_vgic_iodev(const struct kvm_io_device *dev)
-{
-       return container_of(dev, struct vgic_io_device, dev);
-}
-
-static bool check_region(const struct kvm *kvm,
-                        const struct vgic_register_region *region,
-                        gpa_t addr, int len)
-{
-       int flags, nr_irqs = kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
-
-       switch (len) {
-       case sizeof(u8):
-               flags = VGIC_ACCESS_8bit;
-               break;
-       case sizeof(u32):
-               flags = VGIC_ACCESS_32bit;
-               break;
-       case sizeof(u64):
-               flags = VGIC_ACCESS_64bit;
-               break;
-       default:
-               return false;
-       }
-
-       if ((region->access_flags & flags) && IS_ALIGNED(addr, len)) {
-               if (!region->bits_per_irq)
-                       return true;
-
-               /* Do we access a non-allocated IRQ? */
-               return VGIC_ADDR_TO_INTID(addr, region->bits_per_irq) < nr_irqs;
-       }
-
-       return false;
-}
-
-const struct vgic_register_region *
-vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
-                    gpa_t addr, int len)
-{
-       const struct vgic_register_region *region;
-
-       region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions,
-                                      addr - iodev->base_addr);
-       if (!region || !check_region(vcpu->kvm, region, addr, len))
-               return NULL;
-
-       return region;
-}
-
-static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
-                            gpa_t addr, u32 *val)
-{
-       struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
-       const struct vgic_register_region *region;
-       struct kvm_vcpu *r_vcpu;
-
-       region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32));
-       if (!region) {
-               *val = 0;
-               return 0;
-       }
-
-       r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu;
-       if (region->uaccess_read)
-               *val = region->uaccess_read(r_vcpu, addr, sizeof(u32));
-       else
-               *val = region->read(r_vcpu, addr, sizeof(u32));
-
-       return 0;
-}
-
-static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
-                             gpa_t addr, const u32 *val)
-{
-       struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
-       const struct vgic_register_region *region;
-       struct kvm_vcpu *r_vcpu;
-
-       region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32));
-       if (!region)
-               return 0;
-
-       r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu;
-       if (region->uaccess_write)
-               return region->uaccess_write(r_vcpu, addr, sizeof(u32), *val);
-
-       region->write(r_vcpu, addr, sizeof(u32), *val);
-       return 0;
-}
-
-/*
- * Userland access to VGIC registers.
- */
-int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev,
-                bool is_write, int offset, u32 *val)
-{
-       if (is_write)
-               return vgic_uaccess_write(vcpu, &dev->dev, offset, val);
-       else
-               return vgic_uaccess_read(vcpu, &dev->dev, offset, val);
-}
-
-static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
-                             gpa_t addr, int len, void *val)
-{
-       struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
-       const struct vgic_register_region *region;
-       unsigned long data = 0;
-
-       region = vgic_get_mmio_region(vcpu, iodev, addr, len);
-       if (!region) {
-               memset(val, 0, len);
-               return 0;
-       }
-
-       switch (iodev->iodev_type) {
-       case IODEV_CPUIF:
-               data = region->read(vcpu, addr, len);
-               break;
-       case IODEV_DIST:
-               data = region->read(vcpu, addr, len);
-               break;
-       case IODEV_REDIST:
-               data = region->read(iodev->redist_vcpu, addr, len);
-               break;
-       case IODEV_ITS:
-               data = region->its_read(vcpu->kvm, iodev->its, addr, len);
-               break;
-       }
-
-       vgic_data_host_to_mmio_bus(val, len, data);
-       return 0;
-}
-
-static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
-                              gpa_t addr, int len, const void *val)
-{
-       struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
-       const struct vgic_register_region *region;
-       unsigned long data = vgic_data_mmio_bus_to_host(val, len);
-
-       region = vgic_get_mmio_region(vcpu, iodev, addr, len);
-       if (!region)
-               return 0;
-
-       switch (iodev->iodev_type) {
-       case IODEV_CPUIF:
-               region->write(vcpu, addr, len, data);
-               break;
-       case IODEV_DIST:
-               region->write(vcpu, addr, len, data);
-               break;
-       case IODEV_REDIST:
-               region->write(iodev->redist_vcpu, addr, len, data);
-               break;
-       case IODEV_ITS:
-               region->its_write(vcpu->kvm, iodev->its, addr, len, data);
-               break;
-       }
-
-       return 0;
-}
-
-struct kvm_io_device_ops kvm_io_gic_ops = {
-       .read = dispatch_mmio_read,
-       .write = dispatch_mmio_write,
-};
-
-int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
-                            enum vgic_type type)
-{
-       struct vgic_io_device *io_device = &kvm->arch.vgic.dist_iodev;
-       int ret = 0;
-       unsigned int len;
-
-       switch (type) {
-       case VGIC_V2:
-               len = vgic_v2_init_dist_iodev(io_device);
-               break;
-       case VGIC_V3:
-               len = vgic_v3_init_dist_iodev(io_device);
-               break;
-       default:
-               BUG_ON(1);
-       }
-
-       io_device->base_addr = dist_base_address;
-       io_device->iodev_type = IODEV_DIST;
-       io_device->redist_vcpu = NULL;
-
-       mutex_lock(&kvm->slots_lock);
-       ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist_base_address,
-                                     len, &io_device->dev);
-       mutex_unlock(&kvm->slots_lock);
-
-       return ret;
-}
diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h
deleted file mode 100644 (file)
index fefcca2..0000000
+++ /dev/null
@@ -1,227 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2015, 2016 ARM Ltd.
- */
-#ifndef __KVM_ARM_VGIC_MMIO_H__
-#define __KVM_ARM_VGIC_MMIO_H__
-
-struct vgic_register_region {
-       unsigned int reg_offset;
-       unsigned int len;
-       unsigned int bits_per_irq;
-       unsigned int access_flags;
-       union {
-               unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr,
-                                     unsigned int len);
-               unsigned long (*its_read)(struct kvm *kvm, struct vgic_its *its,
-                                         gpa_t addr, unsigned int len);
-       };
-       union {
-               void (*write)(struct kvm_vcpu *vcpu, gpa_t addr,
-                             unsigned int len, unsigned long val);
-               void (*its_write)(struct kvm *kvm, struct vgic_its *its,
-                                 gpa_t addr, unsigned int len,
-                                 unsigned long val);
-       };
-       unsigned long (*uaccess_read)(struct kvm_vcpu *vcpu, gpa_t addr,
-                                     unsigned int len);
-       union {
-               int (*uaccess_write)(struct kvm_vcpu *vcpu, gpa_t addr,
-                                    unsigned int len, unsigned long val);
-               int (*uaccess_its_write)(struct kvm *kvm, struct vgic_its *its,
-                                        gpa_t addr, unsigned int len,
-                                        unsigned long val);
-       };
-};
-
-extern struct kvm_io_device_ops kvm_io_gic_ops;
-
-#define VGIC_ACCESS_8bit       1
-#define VGIC_ACCESS_32bit      2
-#define VGIC_ACCESS_64bit      4
-
-/*
- * Generate a mask that covers the number of bytes required to address
- * up to 1024 interrupts, each represented by <bits> bits. This assumes
- * that <bits> is a power of two.
- */
-#define VGIC_ADDR_IRQ_MASK(bits) (((bits) * 1024 / 8) - 1)
-
-/*
- * (addr & mask) gives us the _byte_ offset for the INT ID.
- * We multiply this by 8 to get the _bit_ offset, then divide this by
- * the number of bits to learn the actual INT ID.
- * But instead of a division (which requires a "long long div" implementation),
- * we shift by the binary logarithm of <bits>.
- * This assumes that <bits> is a power of two.
- */
-#define VGIC_ADDR_TO_INTID(addr, bits)  (((addr) & VGIC_ADDR_IRQ_MASK(bits)) * \
-                                       8 >> ilog2(bits))
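/*
 * Standalone illustration (not kernel code): a worked example for the two
 * macros above. Local copies are used so this builds in user space, with
 * __builtin_ctz() standing in for the kernel's ilog2() since <bits> is
 * always a power of two here.
 */
#include <stdio.h>

#define VGIC_ADDR_IRQ_MASK(bits)  (((bits) * 1024 / 8) - 1)
#define VGIC_ADDR_TO_INTID(addr, bits)  (((addr) & VGIC_ADDR_IRQ_MASK(bits)) * \
                                        8 >> __builtin_ctz(bits))

int main(void)
{
        /* 1 bit per IRQ (enable/pending registers): byte 8 starts at INTID 64. */
        printf("bits=1, offset 0x08 -> INTID %lu\n",
               (unsigned long)VGIC_ADDR_TO_INTID(0x08UL, 1));

        /* 2 bits per IRQ (config registers): byte 0x10 also starts at INTID 64. */
        printf("bits=2, offset 0x10 -> INTID %lu\n",
               (unsigned long)VGIC_ADDR_TO_INTID(0x10UL, 2));

        /* 8 bits per IRQ (priority registers): byte 0x40 is INTID 64. */
        printf("bits=8, offset 0x40 -> INTID %lu\n",
               (unsigned long)VGIC_ADDR_TO_INTID(0x40UL, 8));
        return 0;
}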
-
-/*
- * Some VGIC registers store per-IRQ information, with a different number
- * of bits per IRQ. For those registers this macro is used.
- * The _WITH_LENGTH version instantiates registers with a fixed length
- * and is mutually exclusive with the _PER_IRQ version.
- */
-#define REGISTER_DESC_WITH_BITS_PER_IRQ(off, rd, wr, ur, uw, bpi, acc) \
-       {                                                               \
-               .reg_offset = off,                                      \
-               .bits_per_irq = bpi,                                    \
-               .len = bpi * 1024 / 8,                                  \
-               .access_flags = acc,                                    \
-               .read = rd,                                             \
-               .write = wr,                                            \
-               .uaccess_read = ur,                                     \
-               .uaccess_write = uw,                                    \
-       }
-
-#define REGISTER_DESC_WITH_LENGTH(off, rd, wr, length, acc)            \
-       {                                                               \
-               .reg_offset = off,                                      \
-               .bits_per_irq = 0,                                      \
-               .len = length,                                          \
-               .access_flags = acc,                                    \
-               .read = rd,                                             \
-               .write = wr,                                            \
-       }
-
-#define REGISTER_DESC_WITH_LENGTH_UACCESS(off, rd, wr, urd, uwr, length, acc) \
-       {                                                               \
-               .reg_offset = off,                                      \
-               .bits_per_irq = 0,                                      \
-               .len = length,                                          \
-               .access_flags = acc,                                    \
-               .read = rd,                                             \
-               .write = wr,                                            \
-               .uaccess_read = urd,                                    \
-               .uaccess_write = uwr,                                   \
-       }
-
-unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len);
-
-void vgic_data_host_to_mmio_bus(void *buf, unsigned int len,
-                               unsigned long data);
-
-unsigned long extract_bytes(u64 data, unsigned int offset,
-                           unsigned int num);
-
-u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
-                    unsigned long val);
-
-unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu,
-                                gpa_t addr, unsigned int len);
-
-unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu,
-                                gpa_t addr, unsigned int len);
-
-void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
-                       unsigned int len, unsigned long val);
-
-int vgic_mmio_uaccess_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
-                              unsigned int len, unsigned long val);
-
-unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu, gpa_t addr,
-                                  unsigned int len);
-
-void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr,
-                          unsigned int len, unsigned long val);
-
-unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu,
-                                   gpa_t addr, unsigned int len);
-
-void vgic_mmio_write_senable(struct kvm_vcpu *vcpu,
-                            gpa_t addr, unsigned int len,
-                            unsigned long val);
-
-void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
-                            gpa_t addr, unsigned int len,
-                            unsigned long val);
-
-int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu,
-                              gpa_t addr, unsigned int len,
-                              unsigned long val);
-
-int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu,
-                              gpa_t addr, unsigned int len,
-                              unsigned long val);
-
-unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
-                                    gpa_t addr, unsigned int len);
-
-void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
-                             gpa_t addr, unsigned int len,
-                             unsigned long val);
-
-void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
-                             gpa_t addr, unsigned int len,
-                             unsigned long val);
-
-int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu,
-                               gpa_t addr, unsigned int len,
-                               unsigned long val);
-
-int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
-                               gpa_t addr, unsigned int len,
-                               unsigned long val);
-
-unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
-                                   gpa_t addr, unsigned int len);
-
-unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu,
-                                   gpa_t addr, unsigned int len);
-
-void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
-                            gpa_t addr, unsigned int len,
-                            unsigned long val);
-
-void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
-                            gpa_t addr, unsigned int len,
-                            unsigned long val);
-
-int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu,
-                                   gpa_t addr, unsigned int len,
-                                   unsigned long val);
-
-int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu,
-                                   gpa_t addr, unsigned int len,
-                                   unsigned long val);
-
-unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
-                                     gpa_t addr, unsigned int len);
-
-void vgic_mmio_write_priority(struct kvm_vcpu *vcpu,
-                             gpa_t addr, unsigned int len,
-                             unsigned long val);
-
-unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu,
-                                   gpa_t addr, unsigned int len);
-
-void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
-                           gpa_t addr, unsigned int len,
-                           unsigned long val);
-
-int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev,
-                bool is_write, int offset, u32 *val);
-
-u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid);
-
-void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid,
-                                   const u64 val);
-
-unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev);
-
-unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev);
-
-u64 vgic_sanitise_outer_cacheability(u64 reg);
-u64 vgic_sanitise_inner_cacheability(u64 reg);
-u64 vgic_sanitise_shareability(u64 reg);
-u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
-                       u64 (*sanitise_fn)(u64));
-
-/* Find the proper register handler entry given a certain address offset */
-const struct vgic_register_region *
-vgic_find_mmio_region(const struct vgic_register_region *regions,
-                     int nr_regions, unsigned int offset);
-
-#endif
diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c
deleted file mode 100644 (file)
index 621cc16..0000000
+++ /dev/null
@@ -1,504 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2015, 2016 ARM Ltd.
- */
-
-#include <linux/irqchip/arm-gic.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <kvm/arm_vgic.h>
-#include <asm/kvm_mmu.h>
-
-#include "vgic.h"
-
-static inline void vgic_v2_write_lr(int lr, u32 val)
-{
-       void __iomem *base = kvm_vgic_global_state.vctrl_base;
-
-       writel_relaxed(val, base + GICH_LR0 + (lr * 4));
-}
-
-void vgic_v2_init_lrs(void)
-{
-       int i;
-
-       for (i = 0; i < kvm_vgic_global_state.nr_lr; i++)
-               vgic_v2_write_lr(i, 0);
-}
-
-void vgic_v2_set_underflow(struct kvm_vcpu *vcpu)
-{
-       struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2;
-
-       cpuif->vgic_hcr |= GICH_HCR_UIE;
-}
-
-static bool lr_signals_eoi_mi(u32 lr_val)
-{
-       return !(lr_val & GICH_LR_STATE) && (lr_val & GICH_LR_EOI) &&
-              !(lr_val & GICH_LR_HW);
-}
-
-/*
- * transfer the content of the LRs back into the corresponding ap_list:
- * - active bit is transferred as is
- * - pending bit is
- *   - transferred as is in case of edge sensitive IRQs
- *   - set to the line-level (resample time) for level sensitive IRQs
- */
-void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
-{
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-       struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2;
-       int lr;
-
-       DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
-
-       cpuif->vgic_hcr &= ~GICH_HCR_UIE;
-
-       for (lr = 0; lr < vgic_cpu->used_lrs; lr++) {
-               u32 val = cpuif->vgic_lr[lr];
-               u32 cpuid, intid = val & GICH_LR_VIRTUALID;
-               struct vgic_irq *irq;
-
-               /* Extract the source vCPU id from the LR */
-               cpuid = val & GICH_LR_PHYSID_CPUID;
-               cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
-               cpuid &= 7;
-
-               /* Notify fds when the guest EOI'ed a level-triggered SPI */
-               if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid))
-                       kvm_notify_acked_irq(vcpu->kvm, 0,
-                                            intid - VGIC_NR_PRIVATE_IRQS);
-
-               irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
-
-               raw_spin_lock(&irq->irq_lock);
-
-               /* Always preserve the active bit */
-               irq->active = !!(val & GICH_LR_ACTIVE_BIT);
-
-               if (irq->active && vgic_irq_is_sgi(intid))
-                       irq->active_source = cpuid;
-
-               /* Edge is the only case where we preserve the pending bit */
-               if (irq->config == VGIC_CONFIG_EDGE &&
-                   (val & GICH_LR_PENDING_BIT)) {
-                       irq->pending_latch = true;
-
-                       if (vgic_irq_is_sgi(intid))
-                               irq->source |= (1 << cpuid);
-               }
-
-               /*
-                * Clear soft pending state when level irqs have been acked.
-                */
-               if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE))
-                       irq->pending_latch = false;
-
-               /*
-                * Level-triggered mapped IRQs are special because we only
-                * observe rising edges as input to the VGIC.
-                *
-                * If the guest never acked the interrupt we have to sample
-                * the physical line and set the line level, because the
-                * device state could have changed or we simply need to
-                * process the still pending interrupt later.
-                *
-                * If this causes us to lower the level, we have to also clear
-                * the physical active state, since we will otherwise never be
-                * told when the interrupt becomes asserted again.
-                */
-               if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT)) {
-                       irq->line_level = vgic_get_phys_line_level(irq);
-
-                       if (!irq->line_level)
-                               vgic_irq_set_phys_active(irq, false);
-               }
-
-               raw_spin_unlock(&irq->irq_lock);
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-
-       vgic_cpu->used_lrs = 0;
-}
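/*
 * Standalone illustration (not kernel code): how the fold above extracts
 * the INTID and, for a GICv2 SGI, the source CPU id from a list register.
 * The field layout is assumed to mirror the GICH_LR_* constants from
 * <linux/irqchip/arm-gic.h>, redefined locally so the snippet is standalone.
 */
#include <stdio.h>
#include <stdint.h>

#define GICH_LR_VIRTUALID               0x3ffUL
#define GICH_LR_PHYSID_CPUID_SHIFT      10
#define GICH_LR_PHYSID_CPUID            (0x3ffUL << GICH_LR_PHYSID_CPUID_SHIFT)

int main(void)
{
        uint32_t lr = 0x00000c05;       /* example LR: SGI 5 sent by vcpu 3 */
        uint32_t intid = lr & GICH_LR_VIRTUALID;
        uint32_t cpuid = ((lr & GICH_LR_PHYSID_CPUID) >>
                          GICH_LR_PHYSID_CPUID_SHIFT) & 7;

        printf("INTID %u, source vcpu %u\n", intid, cpuid);     /* 5, 3 */
        return 0;
}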
-
-/*
- * Populates the particular LR with the state of a given IRQ:
- * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq
- * - for a level sensitive IRQ the pending state value is unchanged;
- *   it is dictated directly by the input level
- *
- * If @irq describes an SGI with multiple sources, we choose the
- * lowest-numbered source VCPU and clear that bit in the source bitmap.
- *
- * The irq_lock must be held by the caller.
- */
-void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
-{
-       u32 val = irq->intid;
-       bool allow_pending = true;
-
-       if (irq->active) {
-               val |= GICH_LR_ACTIVE_BIT;
-               if (vgic_irq_is_sgi(irq->intid))
-                       val |= irq->active_source << GICH_LR_PHYSID_CPUID_SHIFT;
-               if (vgic_irq_is_multi_sgi(irq)) {
-                       allow_pending = false;
-                       val |= GICH_LR_EOI;
-               }
-       }
-
-       if (irq->group)
-               val |= GICH_LR_GROUP1;
-
-       if (irq->hw) {
-               val |= GICH_LR_HW;
-               val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT;
-               /*
-                * Never set pending+active on a HW interrupt, as the
-                * pending state is kept at the physical distributor
-                * level.
-                */
-               if (irq->active)
-                       allow_pending = false;
-       } else {
-               if (irq->config == VGIC_CONFIG_LEVEL) {
-                       val |= GICH_LR_EOI;
-
-                       /*
-                        * Software resampling doesn't work very well
-                        * if we allow P+A, so let's not do that.
-                        */
-                       if (irq->active)
-                               allow_pending = false;
-               }
-       }
-
-       if (allow_pending && irq_is_pending(irq)) {
-               val |= GICH_LR_PENDING_BIT;
-
-               if (irq->config == VGIC_CONFIG_EDGE)
-                       irq->pending_latch = false;
-
-               if (vgic_irq_is_sgi(irq->intid)) {
-                       u32 src = ffs(irq->source);
-
-                       if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n",
-                                          irq->intid))
-                               return;
-
-                       val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
-                       irq->source &= ~(1 << (src - 1));
-                       if (irq->source) {
-                               irq->pending_latch = true;
-                               val |= GICH_LR_EOI;
-                       }
-               }
-       }
-
-       /*
-        * Level-triggered mapped IRQs are special because we only observe
-        * rising edges as input to the VGIC.  We therefore lower the line
-        * level here, so that we can take new virtual IRQs.  See
-        * vgic_v2_fold_lr_state for more info.
-        */
-       if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT))
-               irq->line_level = false;
-
-       /* The GICv2 LR only holds five bits of priority. */
-       val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT;
-
-       vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val;
-}
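/*
 * Standalone illustration (not kernel code): the five-bit priority
 * compression mentioned just above. The shift is assumed to mirror the
 * kernel's GICH_LR_PRIORITY_SHIFT (priority lives in LR bits [27:23]) and
 * is redefined locally for a user-space build.
 */
#include <stdio.h>
#include <stdint.h>

#define GICH_LR_PRIORITY_SHIFT  23

int main(void)
{
        uint8_t prio = 0xa0;    /* 8-bit VGIC priority */
        uint32_t lr = (uint32_t)(prio >> 3) << GICH_LR_PRIORITY_SHIFT;
        uint8_t back = (uint8_t)(((lr >> GICH_LR_PRIORITY_SHIFT) & 0x1f) << 3);

        /* The low three bits are dropped: 0xa0 survives, 0xa7 would become 0xa0. */
        printf("LR priority field 0x%02x, read back as 0x%02x\n",
               prio >> 3, back);
        return 0;
}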
-
-void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr)
-{
-       vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = 0;
-}
-
-void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
-       struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-       u32 vmcr;
-
-       vmcr = (vmcrp->grpen0 << GICH_VMCR_ENABLE_GRP0_SHIFT) &
-               GICH_VMCR_ENABLE_GRP0_MASK;
-       vmcr |= (vmcrp->grpen1 << GICH_VMCR_ENABLE_GRP1_SHIFT) &
-               GICH_VMCR_ENABLE_GRP1_MASK;
-       vmcr |= (vmcrp->ackctl << GICH_VMCR_ACK_CTL_SHIFT) &
-               GICH_VMCR_ACK_CTL_MASK;
-       vmcr |= (vmcrp->fiqen << GICH_VMCR_FIQ_EN_SHIFT) &
-               GICH_VMCR_FIQ_EN_MASK;
-       vmcr |= (vmcrp->cbpr << GICH_VMCR_CBPR_SHIFT) &
-               GICH_VMCR_CBPR_MASK;
-       vmcr |= (vmcrp->eoim << GICH_VMCR_EOI_MODE_SHIFT) &
-               GICH_VMCR_EOI_MODE_MASK;
-       vmcr |= (vmcrp->abpr << GICH_VMCR_ALIAS_BINPOINT_SHIFT) &
-               GICH_VMCR_ALIAS_BINPOINT_MASK;
-       vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) &
-               GICH_VMCR_BINPOINT_MASK;
-       vmcr |= ((vmcrp->pmr >> GICV_PMR_PRIORITY_SHIFT) <<
-                GICH_VMCR_PRIMASK_SHIFT) & GICH_VMCR_PRIMASK_MASK;
-
-       cpu_if->vgic_vmcr = vmcr;
-}
-
-void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
-       struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-       u32 vmcr;
-
-       vmcr = cpu_if->vgic_vmcr;
-
-       vmcrp->grpen0 = (vmcr & GICH_VMCR_ENABLE_GRP0_MASK) >>
-               GICH_VMCR_ENABLE_GRP0_SHIFT;
-       vmcrp->grpen1 = (vmcr & GICH_VMCR_ENABLE_GRP1_MASK) >>
-               GICH_VMCR_ENABLE_GRP1_SHIFT;
-       vmcrp->ackctl = (vmcr & GICH_VMCR_ACK_CTL_MASK) >>
-               GICH_VMCR_ACK_CTL_SHIFT;
-       vmcrp->fiqen = (vmcr & GICH_VMCR_FIQ_EN_MASK) >>
-               GICH_VMCR_FIQ_EN_SHIFT;
-       vmcrp->cbpr = (vmcr & GICH_VMCR_CBPR_MASK) >>
-               GICH_VMCR_CBPR_SHIFT;
-       vmcrp->eoim = (vmcr & GICH_VMCR_EOI_MODE_MASK) >>
-               GICH_VMCR_EOI_MODE_SHIFT;
-
-       vmcrp->abpr = (vmcr & GICH_VMCR_ALIAS_BINPOINT_MASK) >>
-                       GICH_VMCR_ALIAS_BINPOINT_SHIFT;
-       vmcrp->bpr  = (vmcr & GICH_VMCR_BINPOINT_MASK) >>
-                       GICH_VMCR_BINPOINT_SHIFT;
-       vmcrp->pmr  = ((vmcr & GICH_VMCR_PRIMASK_MASK) >>
-                       GICH_VMCR_PRIMASK_SHIFT) << GICV_PMR_PRIORITY_SHIFT;
-}
-
-void vgic_v2_enable(struct kvm_vcpu *vcpu)
-{
-       /*
-        * By forcing VMCR to zero, the GIC will restore the binary
-        * points to their reset values. Anything else resets to zero
-        * anyway.
-        */
-       vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0;
-
-       /* Get the show on the road... */
-       vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN;
-}
-
-/* check for overlapping regions and for regions crossing the end of memory */
-static bool vgic_v2_check_base(gpa_t dist_base, gpa_t cpu_base)
-{
-       if (dist_base + KVM_VGIC_V2_DIST_SIZE < dist_base)
-               return false;
-       if (cpu_base + KVM_VGIC_V2_CPU_SIZE < cpu_base)
-               return false;
-
-       if (dist_base + KVM_VGIC_V2_DIST_SIZE <= cpu_base)
-               return true;
-       if (cpu_base + KVM_VGIC_V2_CPU_SIZE <= dist_base)
-               return true;
-
-       return false;
-}
-
-int vgic_v2_map_resources(struct kvm *kvm)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       int ret = 0;
-
-       if (vgic_ready(kvm))
-               goto out;
-
-       if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
-           IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) {
-               kvm_err("Need to set vgic cpu and dist addresses first\n");
-               ret = -ENXIO;
-               goto out;
-       }
-
-       if (!vgic_v2_check_base(dist->vgic_dist_base, dist->vgic_cpu_base)) {
-               kvm_err("VGIC CPU and dist frames overlap\n");
-               ret = -EINVAL;
-               goto out;
-       }
-
-       /*
-        * Initialize the vgic if this hasn't already been done on demand by
-        * accessing the vgic state from userspace.
-        */
-       ret = vgic_init(kvm);
-       if (ret) {
-               kvm_err("Unable to initialize VGIC dynamic data structures\n");
-               goto out;
-       }
-
-       ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V2);
-       if (ret) {
-               kvm_err("Unable to register VGIC MMIO regions\n");
-               goto out;
-       }
-
-       if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) {
-               ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base,
-                                           kvm_vgic_global_state.vcpu_base,
-                                           KVM_VGIC_V2_CPU_SIZE, true);
-               if (ret) {
-                       kvm_err("Unable to remap VGIC CPU to VCPU\n");
-                       goto out;
-               }
-       }
-
-       dist->ready = true;
-
-out:
-       return ret;
-}
-
-DEFINE_STATIC_KEY_FALSE(vgic_v2_cpuif_trap);
-
-/**
- * vgic_v2_probe - probe for a VGICv2 compatible interrupt controller
- * @info:      pointer to the GIC description
- *
- * Returns 0 if the VGICv2 has been probed successfully, or an error code
- * otherwise.
- */
-int vgic_v2_probe(const struct gic_kvm_info *info)
-{
-       int ret;
-       u32 vtr;
-
-       if (!info->vctrl.start) {
-               kvm_err("GICH not present in the firmware table\n");
-               return -ENXIO;
-       }
-
-       if (!PAGE_ALIGNED(info->vcpu.start) ||
-           !PAGE_ALIGNED(resource_size(&info->vcpu))) {
-               kvm_info("GICV region size/alignment is unsafe, using trapping (reduced performance)\n");
-
-               ret = create_hyp_io_mappings(info->vcpu.start,
-                                            resource_size(&info->vcpu),
-                                            &kvm_vgic_global_state.vcpu_base_va,
-                                            &kvm_vgic_global_state.vcpu_hyp_va);
-               if (ret) {
-                       kvm_err("Cannot map GICV into hyp\n");
-                       goto out;
-               }
-
-               static_branch_enable(&vgic_v2_cpuif_trap);
-       }
-
-       ret = create_hyp_io_mappings(info->vctrl.start,
-                                    resource_size(&info->vctrl),
-                                    &kvm_vgic_global_state.vctrl_base,
-                                    &kvm_vgic_global_state.vctrl_hyp);
-       if (ret) {
-               kvm_err("Cannot map VCTRL into hyp\n");
-               goto out;
-       }
-
-       vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR);
-       kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1;
-
-       ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
-       if (ret) {
-               kvm_err("Cannot register GICv2 KVM device\n");
-               goto out;
-       }
-
-       kvm_vgic_global_state.can_emulate_gicv2 = true;
-       kvm_vgic_global_state.vcpu_base = info->vcpu.start;
-       kvm_vgic_global_state.type = VGIC_V2;
-       kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS;
-
-       kvm_debug("vgic-v2@%llx\n", info->vctrl.start);
-
-       return 0;
-out:
-       if (kvm_vgic_global_state.vctrl_base)
-               iounmap(kvm_vgic_global_state.vctrl_base);
-       if (kvm_vgic_global_state.vcpu_base_va)
-               iounmap(kvm_vgic_global_state.vcpu_base_va);
-
-       return ret;
-}
-
-static void save_lrs(struct kvm_vcpu *vcpu, void __iomem *base)
-{
-       struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-       u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
-       u64 elrsr;
-       int i;
-
-       elrsr = readl_relaxed(base + GICH_ELRSR0);
-       if (unlikely(used_lrs > 32))
-               elrsr |= ((u64)readl_relaxed(base + GICH_ELRSR1)) << 32;
-
-       for (i = 0; i < used_lrs; i++) {
-               if (elrsr & (1UL << i))
-                       cpu_if->vgic_lr[i] &= ~GICH_LR_STATE;
-               else
-                       cpu_if->vgic_lr[i] = readl_relaxed(base + GICH_LR0 + (i * 4));
-
-               writel_relaxed(0, base + GICH_LR0 + (i * 4));
-       }
-}
-
-void vgic_v2_save_state(struct kvm_vcpu *vcpu)
-{
-       void __iomem *base = kvm_vgic_global_state.vctrl_base;
-       u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
-
-       if (!base)
-               return;
-
-       if (used_lrs) {
-               save_lrs(vcpu, base);
-               writel_relaxed(0, base + GICH_HCR);
-       }
-}
-
-void vgic_v2_restore_state(struct kvm_vcpu *vcpu)
-{
-       struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-       void __iomem *base = kvm_vgic_global_state.vctrl_base;
-       u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
-       int i;
-
-       if (!base)
-               return;
-
-       if (used_lrs) {
-               writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR);
-               for (i = 0; i < used_lrs; i++) {
-                       writel_relaxed(cpu_if->vgic_lr[i],
-                                      base + GICH_LR0 + (i * 4));
-               }
-       }
-}
-
-void vgic_v2_load(struct kvm_vcpu *vcpu)
-{
-       struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-
-       writel_relaxed(cpu_if->vgic_vmcr,
-                      kvm_vgic_global_state.vctrl_base + GICH_VMCR);
-       writel_relaxed(cpu_if->vgic_apr,
-                      kvm_vgic_global_state.vctrl_base + GICH_APR);
-}
-
-void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu)
-{
-       struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-
-       cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR);
-}
-
-void vgic_v2_put(struct kvm_vcpu *vcpu)
-{
-       struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-
-       vgic_v2_vmcr_sync(vcpu);
-       cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR);
-}
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
deleted file mode 100644 (file)
index 2c9fc13..0000000
+++ /dev/null
@@ -1,693 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-
-#include <linux/irqchip/arm-gic-v3.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <kvm/arm_vgic.h>
-#include <asm/kvm_hyp.h>
-#include <asm/kvm_mmu.h>
-#include <asm/kvm_asm.h>
-
-#include "vgic.h"
-
-static bool group0_trap;
-static bool group1_trap;
-static bool common_trap;
-static bool gicv4_enable;
-
-void vgic_v3_set_underflow(struct kvm_vcpu *vcpu)
-{
-       struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3;
-
-       cpuif->vgic_hcr |= ICH_HCR_UIE;
-}
-
-static bool lr_signals_eoi_mi(u64 lr_val)
-{
-       return !(lr_val & ICH_LR_STATE) && (lr_val & ICH_LR_EOI) &&
-              !(lr_val & ICH_LR_HW);
-}
-
-void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
-{
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-       struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3;
-       u32 model = vcpu->kvm->arch.vgic.vgic_model;
-       int lr;
-
-       DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
-
-       cpuif->vgic_hcr &= ~ICH_HCR_UIE;
-
-       for (lr = 0; lr < vgic_cpu->used_lrs; lr++) {
-               u64 val = cpuif->vgic_lr[lr];
-               u32 intid, cpuid;
-               struct vgic_irq *irq;
-               bool is_v2_sgi = false;
-
-               cpuid = val & GICH_LR_PHYSID_CPUID;
-               cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
-
-               if (model == KVM_DEV_TYPE_ARM_VGIC_V3) {
-                       intid = val & ICH_LR_VIRTUAL_ID_MASK;
-               } else {
-                       intid = val & GICH_LR_VIRTUALID;
-                       is_v2_sgi = vgic_irq_is_sgi(intid);
-               }
-
-               /* Notify fds when the guest EOI'ed a level-triggered IRQ */
-               if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid))
-                       kvm_notify_acked_irq(vcpu->kvm, 0,
-                                            intid - VGIC_NR_PRIVATE_IRQS);
-
-               irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
-               if (!irq)       /* An LPI could have been unmapped. */
-                       continue;
-
-               raw_spin_lock(&irq->irq_lock);
-
-               /* Always preserve the active bit */
-               irq->active = !!(val & ICH_LR_ACTIVE_BIT);
-
-               if (irq->active && is_v2_sgi)
-                       irq->active_source = cpuid;
-
-               /* Edge is the only case where we preserve the pending bit */
-               if (irq->config == VGIC_CONFIG_EDGE &&
-                   (val & ICH_LR_PENDING_BIT)) {
-                       irq->pending_latch = true;
-
-                       if (is_v2_sgi)
-                               irq->source |= (1 << cpuid);
-               }
-
-               /*
-                * Clear soft pending state when level irqs have been acked.
-                */
-               if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE))
-                       irq->pending_latch = false;
-
-               /*
-                * Level-triggered mapped IRQs are special because we only
-                * observe rising edges as input to the VGIC.
-                *
-                * If the guest never acked the interrupt we have to sample
-                * the physical line and set the line level, because the
-                * device state could have changed or we simply need to
-                * process the still pending interrupt later.
-                *
-                * If this causes us to lower the level, we have to also clear
-                * the physical active state, since we will otherwise never be
-                * told when the interrupt becomes asserted again.
-                */
-               if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) {
-                       irq->line_level = vgic_get_phys_line_level(irq);
-
-                       if (!irq->line_level)
-                               vgic_irq_set_phys_active(irq, false);
-               }
-
-               raw_spin_unlock(&irq->irq_lock);
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-
-       vgic_cpu->used_lrs = 0;
-}
-
-/* Requires the irq to be locked already */
-void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
-{
-       u32 model = vcpu->kvm->arch.vgic.vgic_model;
-       u64 val = irq->intid;
-       bool allow_pending = true, is_v2_sgi;
-
-       is_v2_sgi = (vgic_irq_is_sgi(irq->intid) &&
-                    model == KVM_DEV_TYPE_ARM_VGIC_V2);
-
-       if (irq->active) {
-               val |= ICH_LR_ACTIVE_BIT;
-               if (is_v2_sgi)
-                       val |= irq->active_source << GICH_LR_PHYSID_CPUID_SHIFT;
-               if (vgic_irq_is_multi_sgi(irq)) {
-                       allow_pending = false;
-                       val |= ICH_LR_EOI;
-               }
-       }
-
-       if (irq->hw) {
-               val |= ICH_LR_HW;
-               val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT;
-               /*
-                * Never set pending+active on a HW interrupt, as the
-                * pending state is kept at the physical distributor
-                * level.
-                */
-               if (irq->active)
-                       allow_pending = false;
-       } else {
-               if (irq->config == VGIC_CONFIG_LEVEL) {
-                       val |= ICH_LR_EOI;
-
-                       /*
-                        * Software resampling doesn't work very well
-                        * if we allow P+A, so let's not do that.
-                        */
-                       if (irq->active)
-                               allow_pending = false;
-               }
-       }
-
-       if (allow_pending && irq_is_pending(irq)) {
-               val |= ICH_LR_PENDING_BIT;
-
-               if (irq->config == VGIC_CONFIG_EDGE)
-                       irq->pending_latch = false;
-
-               if (vgic_irq_is_sgi(irq->intid) &&
-                   model == KVM_DEV_TYPE_ARM_VGIC_V2) {
-                       u32 src = ffs(irq->source);
-
-                       if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n",
-                                          irq->intid))
-                               return;
-
-                       val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
-                       irq->source &= ~(1 << (src - 1));
-                       if (irq->source) {
-                               irq->pending_latch = true;
-                               val |= ICH_LR_EOI;
-                       }
-               }
-       }
-
-       /*
-        * Level-triggered mapped IRQs are special because we only observe
-        * rising edges as input to the VGIC.  We therefore lower the line
-        * level here, so that we can take new virtual IRQs.  See
-        * vgic_v3_fold_lr_state for more info.
-        */
-       if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT))
-               irq->line_level = false;
-
-       if (irq->group)
-               val |= ICH_LR_GROUP;
-
-       val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT;
-
-       vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val;
-}
-
-void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr)
-{
-       vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = 0;
-}
-
-void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
-       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-       u32 model = vcpu->kvm->arch.vgic.vgic_model;
-       u32 vmcr;
-
-       if (model == KVM_DEV_TYPE_ARM_VGIC_V2) {
-               vmcr = (vmcrp->ackctl << ICH_VMCR_ACK_CTL_SHIFT) &
-                       ICH_VMCR_ACK_CTL_MASK;
-               vmcr |= (vmcrp->fiqen << ICH_VMCR_FIQ_EN_SHIFT) &
-                       ICH_VMCR_FIQ_EN_MASK;
-       } else {
-               /*
-                * When emulating GICv3 on GICv3 with SRE=1, the VFIQEn
-                * bit is RES1 and the VAckCtl bit is RES0.
-                */
-               vmcr = ICH_VMCR_FIQ_EN_MASK;
-       }
-
-       vmcr |= (vmcrp->cbpr << ICH_VMCR_CBPR_SHIFT) & ICH_VMCR_CBPR_MASK;
-       vmcr |= (vmcrp->eoim << ICH_VMCR_EOIM_SHIFT) & ICH_VMCR_EOIM_MASK;
-       vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK;
-       vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK;
-       vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK;
-       vmcr |= (vmcrp->grpen0 << ICH_VMCR_ENG0_SHIFT) & ICH_VMCR_ENG0_MASK;
-       vmcr |= (vmcrp->grpen1 << ICH_VMCR_ENG1_SHIFT) & ICH_VMCR_ENG1_MASK;
-
-       cpu_if->vgic_vmcr = vmcr;
-}
-
-void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
-       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-       u32 model = vcpu->kvm->arch.vgic.vgic_model;
-       u32 vmcr;
-
-       vmcr = cpu_if->vgic_vmcr;
-
-       if (model == KVM_DEV_TYPE_ARM_VGIC_V2) {
-               vmcrp->ackctl = (vmcr & ICH_VMCR_ACK_CTL_MASK) >>
-                       ICH_VMCR_ACK_CTL_SHIFT;
-               vmcrp->fiqen = (vmcr & ICH_VMCR_FIQ_EN_MASK) >>
-                       ICH_VMCR_FIQ_EN_SHIFT;
-       } else {
-               /*
-                * When emulating GICv3 on GICv3 with SRE=1, the VFIQEn
-                * bit is RES1 and the VAckCtl bit is RES0.
-                */
-               vmcrp->fiqen = 1;
-               vmcrp->ackctl = 0;
-       }
-
-       vmcrp->cbpr = (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT;
-       vmcrp->eoim = (vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT;
-       vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;
-       vmcrp->bpr  = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT;
-       vmcrp->pmr  = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
-       vmcrp->grpen0 = (vmcr & ICH_VMCR_ENG0_MASK) >> ICH_VMCR_ENG0_SHIFT;
-       vmcrp->grpen1 = (vmcr & ICH_VMCR_ENG1_MASK) >> ICH_VMCR_ENG1_SHIFT;
-}
-
-#define INITIAL_PENDBASER_VALUE                                                  \
-       (GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb)            | \
-       GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner)      | \
-       GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable))
-
-void vgic_v3_enable(struct kvm_vcpu *vcpu)
-{
-       struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
-
-       /*
-        * By forcing VMCR to zero, the GIC will restore the binary
-        * points to their reset values. Anything else resets to zero
-        * anyway.
-        */
-       vgic_v3->vgic_vmcr = 0;
-
-       /*
-        * If we are emulating a GICv3, we do it in a non-GICv2-compatible
-        * way, so we force SRE to 1 to demonstrate this to the guest.
-        * Also, we don't support any form of IRQ/FIQ bypass.
-        * This goes with the spec allowing the value to be RAO/WI.
-        */
-       if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
-               vgic_v3->vgic_sre = (ICC_SRE_EL1_DIB |
-                                    ICC_SRE_EL1_DFB |
-                                    ICC_SRE_EL1_SRE);
-               vcpu->arch.vgic_cpu.pendbaser = INITIAL_PENDBASER_VALUE;
-       } else {
-               vgic_v3->vgic_sre = 0;
-       }
-
-       vcpu->arch.vgic_cpu.num_id_bits = (kvm_vgic_global_state.ich_vtr_el2 &
-                                          ICH_VTR_ID_BITS_MASK) >>
-                                          ICH_VTR_ID_BITS_SHIFT;
-       vcpu->arch.vgic_cpu.num_pri_bits = ((kvm_vgic_global_state.ich_vtr_el2 &
-                                           ICH_VTR_PRI_BITS_MASK) >>
-                                           ICH_VTR_PRI_BITS_SHIFT) + 1;
-
-       /* Get the show on the road... */
-       vgic_v3->vgic_hcr = ICH_HCR_EN;
-       if (group0_trap)
-               vgic_v3->vgic_hcr |= ICH_HCR_TALL0;
-       if (group1_trap)
-               vgic_v3->vgic_hcr |= ICH_HCR_TALL1;
-       if (common_trap)
-               vgic_v3->vgic_hcr |= ICH_HCR_TC;
-}
-
-int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq)
-{
-       struct kvm_vcpu *vcpu;
-       int byte_offset, bit_nr;
-       gpa_t pendbase, ptr;
-       bool status;
-       u8 val;
-       int ret;
-       unsigned long flags;
-
-retry:
-       vcpu = irq->target_vcpu;
-       if (!vcpu)
-               return 0;
-
-       pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);
-
-       byte_offset = irq->intid / BITS_PER_BYTE;
-       bit_nr = irq->intid % BITS_PER_BYTE;
-       ptr = pendbase + byte_offset;
-
-       ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
-       if (ret)
-               return ret;
-
-       status = val & (1 << bit_nr);
-
-       raw_spin_lock_irqsave(&irq->irq_lock, flags);
-       if (irq->target_vcpu != vcpu) {
-               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-               goto retry;
-       }
-       irq->pending_latch = status;
-       vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-
-       if (status) {
-               /* clear consumed data */
-               val &= ~(1 << bit_nr);
-               ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
-               if (ret)
-                       return ret;
-       }
-       return 0;
-}
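/*
 * Standalone illustration (not kernel code): the pending-table indexing
 * used above. Each LPI owns a single bit in guest memory, so an INTID maps
 * to byte intid / 8 from PENDBASER and bit intid % 8 within that byte.
 * BITS_PER_BYTE is redefined locally.
 */
#include <stdio.h>

#define BITS_PER_BYTE   8

int main(void)
{
        unsigned int intid = 8195;      /* an arbitrary LPI INTID (LPIs start at 8192) */
        unsigned int byte_offset = intid / BITS_PER_BYTE;
        unsigned int bit_nr = intid % BITS_PER_BYTE;

        /* Prints: LPI 8195 -> byte 1024, bit 3 */
        printf("LPI %u -> byte %u, bit %u\n", intid, byte_offset, bit_nr);
        return 0;
}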
-
-/**
- * vgic_v3_save_pending_tables - Save the pending tables into guest RAM
- * The kvm lock and all vcpu locks must be held.
- */
-int vgic_v3_save_pending_tables(struct kvm *kvm)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       struct vgic_irq *irq;
-       gpa_t last_ptr = ~(gpa_t)0;
-       int ret;
-       u8 val;
-
-       list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
-               int byte_offset, bit_nr;
-               struct kvm_vcpu *vcpu;
-               gpa_t pendbase, ptr;
-               bool stored;
-
-               vcpu = irq->target_vcpu;
-               if (!vcpu)
-                       continue;
-
-               pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);
-
-               byte_offset = irq->intid / BITS_PER_BYTE;
-               bit_nr = irq->intid % BITS_PER_BYTE;
-               ptr = pendbase + byte_offset;
-
-               if (ptr != last_ptr) {
-                       ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
-                       if (ret)
-                               return ret;
-                       last_ptr = ptr;
-               }
-
-               stored = val & (1U << bit_nr);
-               if (stored == irq->pending_latch)
-                       continue;
-
-               if (irq->pending_latch)
-                       val |= 1 << bit_nr;
-               else
-                       val &= ~(1 << bit_nr);
-
-               ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
-               if (ret)
-                       return ret;
-       }
-       return 0;
-}
-
-/**
- * vgic_v3_rdist_overlap - check if a region overlaps with any
- * existing redistributor region
- *
- * @kvm: kvm handle
- * @base: base of the region
- * @size: size of region
- *
- * Return: true if there is an overlap
- */
-bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size)
-{
-       struct vgic_dist *d = &kvm->arch.vgic;
-       struct vgic_redist_region *rdreg;
-
-       list_for_each_entry(rdreg, &d->rd_regions, list) {
-               if ((base + size > rdreg->base) &&
-                       (base < rdreg->base + vgic_v3_rd_region_size(kvm, rdreg)))
-                       return true;
-       }
-       return false;
-}
-
-/*
- * Check for overlapping regions and for regions crossing the end of memory
- * for base addresses which have already been set.
- */
-bool vgic_v3_check_base(struct kvm *kvm)
-{
-       struct vgic_dist *d = &kvm->arch.vgic;
-       struct vgic_redist_region *rdreg;
-
-       if (!IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) &&
-           d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE < d->vgic_dist_base)
-               return false;
-
-       list_for_each_entry(rdreg, &d->rd_regions, list) {
-               if (rdreg->base + vgic_v3_rd_region_size(kvm, rdreg) <
-                       rdreg->base)
-                       return false;
-       }
-
-       if (IS_VGIC_ADDR_UNDEF(d->vgic_dist_base))
-               return true;
-
-       return !vgic_v3_rdist_overlap(kvm, d->vgic_dist_base,
-                                     KVM_VGIC_V3_DIST_SIZE);
-}
-
-/**
- * vgic_v3_rdist_free_slot - Look up registered rdist regions and identify one
- * which has free space to put a new rdist region.
- *
- * @rd_regions: redistributor region list head
- *
- * A redistributor region maps n redistributors, where
- * n = region size / (2 x 64kB).  The stride between redistributors is 0
- * and regions are filled in index order.
- *
- * Return: the redist region handle, if any, that still has space to map a
- * new redistributor.
- */
-struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rd_regions)
-{
-       struct vgic_redist_region *rdreg;
-
-       list_for_each_entry(rdreg, rd_regions, list) {
-               if (!vgic_v3_redist_region_full(rdreg))
-                       return rdreg;
-       }
-       return NULL;
-}
-
-struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm,
-                                                          u32 index)
-{
-       struct list_head *rd_regions = &kvm->arch.vgic.rd_regions;
-       struct vgic_redist_region *rdreg;
-
-       list_for_each_entry(rdreg, rd_regions, list) {
-               if (rdreg->index == index)
-                       return rdreg;
-       }
-       return NULL;
-}
-
-int vgic_v3_map_resources(struct kvm *kvm)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       struct kvm_vcpu *vcpu;
-       int ret = 0;
-       int c;
-
-       if (vgic_ready(kvm))
-               goto out;
-
-       kvm_for_each_vcpu(c, vcpu, kvm) {
-               struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
-               if (IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr)) {
-                       kvm_debug("vcpu %d redistributor base not set\n", c);
-                       ret = -ENXIO;
-                       goto out;
-               }
-       }
-
-       if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base)) {
-               kvm_err("Need to set vgic distributor addresses first\n");
-               ret = -ENXIO;
-               goto out;
-       }
-
-       if (!vgic_v3_check_base(kvm)) {
-               kvm_err("VGIC redist and dist frames overlap\n");
-               ret = -EINVAL;
-               goto out;
-       }
-
-       /*
-        * For a VGICv3 we require userspace to explicitly initialize
-        * the VGIC before we can use it.
-        */
-       if (!vgic_initialized(kvm)) {
-               ret = -EBUSY;
-               goto out;
-       }
-
-       ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V3);
-       if (ret) {
-               kvm_err("Unable to register VGICv3 dist MMIO regions\n");
-               goto out;
-       }
-
-       if (kvm_vgic_global_state.has_gicv4_1)
-               vgic_v4_configure_vsgis(kvm);
-       dist->ready = true;
-
-out:
-       return ret;
-}
-
-DEFINE_STATIC_KEY_FALSE(vgic_v3_cpuif_trap);
-
-static int __init early_group0_trap_cfg(char *buf)
-{
-       return strtobool(buf, &group0_trap);
-}
-early_param("kvm-arm.vgic_v3_group0_trap", early_group0_trap_cfg);
-
-static int __init early_group1_trap_cfg(char *buf)
-{
-       return strtobool(buf, &group1_trap);
-}
-early_param("kvm-arm.vgic_v3_group1_trap", early_group1_trap_cfg);
-
-static int __init early_common_trap_cfg(char *buf)
-{
-       return strtobool(buf, &common_trap);
-}
-early_param("kvm-arm.vgic_v3_common_trap", early_common_trap_cfg);
-
-static int __init early_gicv4_enable(char *buf)
-{
-       return strtobool(buf, &gicv4_enable);
-}
-early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable);
-
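/*
 * Editor's note (usage illustration, not part of the original file): the
 * early parameters above are boolean kernel command line options parsed by
 * strtobool(), e.g.:
 *
 *   kvm-arm.vgic_v3_group0_trap=1 kvm-arm.vgic_v3_group1_trap=1
 *   kvm-arm.vgic_v4_enable=y
 *
 * Group 0/1/common trapping forces the guest's ICC_* accesses to trap to
 * the hypervisor (see the "reduced performance" message in vgic_v3_probe()
 * below), while vgic_v4_enable opts in to direct VLPI injection on v4
 * capable ITSs.
 */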
-/**
- * vgic_v3_probe - probe for a VGICv3 compatible interrupt controller
- * @info:      pointer to the GIC description
- *
- * Returns 0 if the VGICv3 has been probed successfully, or an error code
- * otherwise.
- */
-int vgic_v3_probe(const struct gic_kvm_info *info)
-{
-       u32 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_ich_vtr_el2);
-       int ret;
-
-       /*
-        * The ListRegs field is 5 bits, but there is an architectural
-        * maximum of 16 list registers. Just ignore bit 4...
-        */
-       kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1;
-       kvm_vgic_global_state.can_emulate_gicv2 = false;
-       kvm_vgic_global_state.ich_vtr_el2 = ich_vtr_el2;
-
-       /* GICv4 support? */
-       if (info->has_v4) {
-               kvm_vgic_global_state.has_gicv4 = gicv4_enable;
-               kvm_vgic_global_state.has_gicv4_1 = info->has_v4_1 && gicv4_enable;
-               kvm_info("GICv4%s support %sabled\n",
-                        kvm_vgic_global_state.has_gicv4_1 ? ".1" : "",
-                        gicv4_enable ? "en" : "dis");
-       }
-
-       if (!info->vcpu.start) {
-               kvm_info("GICv3: no GICV resource entry\n");
-               kvm_vgic_global_state.vcpu_base = 0;
-       } else if (!PAGE_ALIGNED(info->vcpu.start)) {
-               pr_warn("GICV physical address 0x%llx not page aligned\n",
-                       (unsigned long long)info->vcpu.start);
-               kvm_vgic_global_state.vcpu_base = 0;
-       } else {
-               kvm_vgic_global_state.vcpu_base = info->vcpu.start;
-               kvm_vgic_global_state.can_emulate_gicv2 = true;
-               ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
-               if (ret) {
-                       kvm_err("Cannot register GICv2 KVM device.\n");
-                       return ret;
-               }
-               kvm_info("vgic-v2@%llx\n", info->vcpu.start);
-       }
-       ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3);
-       if (ret) {
-               kvm_err("Cannot register GICv3 KVM device.\n");
-               kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2);
-               return ret;
-       }
-
-       if (kvm_vgic_global_state.vcpu_base == 0)
-               kvm_info("disabling GICv2 emulation\n");
-
-#ifdef CONFIG_ARM64
-       if (cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_30115)) {
-               group0_trap = true;
-               group1_trap = true;
-       }
-#endif
-
-       if (group0_trap || group1_trap || common_trap) {
-               kvm_info("GICv3 sysreg trapping enabled ([%s%s%s], reduced performance)\n",
-                        group0_trap ? "G0" : "",
-                        group1_trap ? "G1" : "",
-                        common_trap ? "C"  : "");
-               static_branch_enable(&vgic_v3_cpuif_trap);
-       }
-
-       kvm_vgic_global_state.vctrl_base = NULL;
-       kvm_vgic_global_state.type = VGIC_V3;
-       kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS;
-
-       return 0;
-}
-
-void vgic_v3_load(struct kvm_vcpu *vcpu)
-{
-       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-
-       /*
-        * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen
-        * is dependent on ICC_SRE_EL1.SRE, and we have to perform the
-        * VMCR_EL2 save/restore in the world switch.
-        */
-       if (likely(cpu_if->vgic_sre))
-               kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr);
-
-       kvm_call_hyp(__vgic_v3_restore_aprs, vcpu);
-
-       if (has_vhe())
-               __vgic_v3_activate_traps(vcpu);
-
-       WARN_ON(vgic_v4_load(vcpu));
-}
-
-void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu)
-{
-       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-
-       if (likely(cpu_if->vgic_sre))
-               cpu_if->vgic_vmcr = kvm_call_hyp_ret(__vgic_v3_read_vmcr);
-}
-
-void vgic_v3_put(struct kvm_vcpu *vcpu)
-{
-       WARN_ON(vgic_v4_put(vcpu, false));
-
-       vgic_v3_vmcr_sync(vcpu);
-
-       kvm_call_hyp(__vgic_v3_save_aprs, vcpu);
-
-       if (has_vhe())
-               __vgic_v3_deactivate_traps(vcpu);
-}
diff --git a/virt/kvm/arm/vgic/vgic-v4.c b/virt/kvm/arm/vgic/vgic-v4.c
deleted file mode 100644 (file)
index 27ac833..0000000
+++ /dev/null
@@ -1,453 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2017 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- */
-
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/irqdomain.h>
-#include <linux/kvm_host.h>
-#include <linux/irqchip/arm-gic-v3.h>
-
-#include "vgic.h"
-
-/*
- * How KVM uses GICv4 (insert rude comments here):
- *
- * The vgic-v4 layer acts as a bridge between several entities:
- * - The GICv4 ITS representation offered by the ITS driver
- * - VFIO, which is in charge of the PCI endpoint
- * - The virtual ITS, which is the only thing the guest sees
- *
- * The configuration of VLPIs is triggered by a callback from VFIO,
- * instructing KVM that a PCI device has been configured to deliver
- * MSIs to a vITS.
- *
- * kvm_vgic_v4_set_forwarding() is thus called with the routing entry,
- * and this is used to find the corresponding vITS data structures
- * (ITS instance, device, event and irq) using a process that is
- * extremely similar to the injection of an MSI.
- *
- * At this stage, we can link the guest's view of an LPI (uniquely
- * identified by the routing entry) and the host irq, using the GICv4
- * driver mapping operation. Should the mapping succeed, we've then
- * successfully upgraded the guest's LPI to a VLPI. We can then start
- * with updating GICv4's view of the property table and generating an
- * INValidation in order to kickstart the delivery of this VLPI to the
- * guest directly, without software intervention. Well, almost.
- *
- * When the PCI endpoint is deconfigured, this operation is reversed
- * with VFIO calling kvm_vgic_v4_unset_forwarding().
- *
- * Once the VLPI has been mapped, it needs to follow any change the
- * guest performs on its LPI through the vITS. For that, a number of
- * command handlers have hooks to communicate these changes to the HW:
- * - Any invalidation triggers a call to its_prop_update_vlpi()
- * - The INT command results in an irq_set_irqchip_state(), which
- *   generates an INT on the corresponding VLPI.
- * - The CLEAR command results in an irq_set_irqchip_state(), which
- *   generates a CLEAR on the corresponding VLPI.
- * - DISCARD translates into an unmap, similar to a call to
- *   kvm_vgic_v4_unset_forwarding().
- * - MOVI is translated by an update of the existing mapping, changing
- *   the target vcpu, resulting in a VMOVI being generated.
- * - MOVALL is translated by a string of mapping updates (similar to
- *   the handling of MOVI). MOVALL is horrible.
- *
- * Note that a DISCARD/MAPTI sequence emitted from the guest without
- * reprogramming the PCI endpoint after MAPTI does not result in a
- * VLPI being mapped, as there is no callback from VFIO (the guest
- * will get the interrupt via the normal SW injection). Fixing this is
- * not trivial, and requires some horrible messing with the VFIO
- * internals. Not fun. Don't do that.
- *
- * Then there is the scheduling. Each time a vcpu is about to run on a
- * physical CPU, KVM must tell the corresponding redistributor about
- * it. And if we've migrated our vcpu from one CPU to another, we must
- * tell the ITS (so that the messages reach the right redistributor).
- * This is done in two steps: first issue an irq_set_affinity() on the
- * irq corresponding to the vcpu, then call its_make_vpe_resident().
- * You must be in a non-preemptible context. On exit, a call to
- * its_make_vpe_non_resident() tells the redistributor that we're done
- * with the vcpu.
- *
- * Finally, the doorbell handling: Each vcpu is allocated an interrupt
- * which will fire each time a VLPI is made pending whilst the vcpu is
- * not running. Each time the vcpu gets blocked, the doorbell
- * interrupt gets enabled. When the vcpu is unblocked (for whatever
- * reason), the doorbell interrupt is disabled.
- */
-
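/*
 * Editor's illustration, not part of the original file: a minimal sketch of
 * how the forwarding described above is driven.  The caller name and the
 * way @host_irq/@entry are obtained are assumptions; in practice an
 * irqbypass/VFIO path resolves the host Linux IRQ backing the MSI and the
 * routing entry the guest programmed, and hands both to KVM.
 */
static int example_forward_msi(struct kvm *kvm, int host_irq,
                               struct kvm_kernel_irq_routing_entry *entry)
{
        /*
         * Returns 0 (leaving delivery to normal SW injection) when direct
         * MSIs are unsupported or @entry doesn't target one of our vITSs;
         * otherwise the guest LPI is upgraded to a VLPI.
         */
        return kvm_vgic_v4_set_forwarding(kvm, host_irq, entry);
}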
-#define DB_IRQ_FLAGS   (IRQ_NOAUTOEN | IRQ_DISABLE_UNLAZY | IRQ_NO_BALANCING)
-
-static irqreturn_t vgic_v4_doorbell_handler(int irq, void *info)
-{
-       struct kvm_vcpu *vcpu = info;
-
-       /* We got the message, no need to fire again */
-       if (!kvm_vgic_global_state.has_gicv4_1 &&
-           !irqd_irq_disabled(&irq_to_desc(irq)->irq_data))
-               disable_irq_nosync(irq);
-
-       vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last = true;
-       kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
-       kvm_vcpu_kick(vcpu);
-
-       return IRQ_HANDLED;
-}
-
-static void vgic_v4_sync_sgi_config(struct its_vpe *vpe, struct vgic_irq *irq)
-{
-       vpe->sgi_config[irq->intid].enabled     = irq->enabled;
-       vpe->sgi_config[irq->intid].group       = irq->group;
-       vpe->sgi_config[irq->intid].priority    = irq->priority;
-}
-
-static void vgic_v4_enable_vsgis(struct kvm_vcpu *vcpu)
-{
-       struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
-       int i;
-
-       /*
-        * With GICv4.1, every virtual SGI can be directly injected. So
-        * let's pretend that they are HW interrupts, tied to a host
-        * IRQ. The SGI code will do its magic.
-        */
-       for (i = 0; i < VGIC_NR_SGIS; i++) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, i);
-               struct irq_desc *desc;
-               unsigned long flags;
-               int ret;
-
-               raw_spin_lock_irqsave(&irq->irq_lock, flags);
-
-               if (irq->hw)
-                       goto unlock;
-
-               irq->hw = true;
-               irq->host_irq = irq_find_mapping(vpe->sgi_domain, i);
-
-               /* Transfer the full irq state to the vPE */
-               vgic_v4_sync_sgi_config(vpe, irq);
-               desc = irq_to_desc(irq->host_irq);
-               ret = irq_domain_activate_irq(irq_desc_get_irq_data(desc),
-                                             false);
-               if (!WARN_ON(ret)) {
-                       /* Transfer pending state */
-                       ret = irq_set_irqchip_state(irq->host_irq,
-                                                   IRQCHIP_STATE_PENDING,
-                                                   irq->pending_latch);
-                       WARN_ON(ret);
-                       irq->pending_latch = false;
-               }
-       unlock:
-               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-}
-
-static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu)
-{
-       int i;
-
-       for (i = 0; i < VGIC_NR_SGIS; i++) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, i);
-               struct irq_desc *desc;
-               unsigned long flags;
-               int ret;
-
-               raw_spin_lock_irqsave(&irq->irq_lock, flags);
-
-               if (!irq->hw)
-                       goto unlock;
-
-               irq->hw = false;
-               ret = irq_get_irqchip_state(irq->host_irq,
-                                           IRQCHIP_STATE_PENDING,
-                                           &irq->pending_latch);
-               WARN_ON(ret);
-
-               desc = irq_to_desc(irq->host_irq);
-               irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
-       unlock:
-               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-}
-
-/* Must be called with the kvm lock held */
-void vgic_v4_configure_vsgis(struct kvm *kvm)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       struct kvm_vcpu *vcpu;
-       int i;
-
-       kvm_arm_halt_guest(kvm);
-
-       kvm_for_each_vcpu(i, vcpu, kvm) {
-               if (dist->nassgireq)
-                       vgic_v4_enable_vsgis(vcpu);
-               else
-                       vgic_v4_disable_vsgis(vcpu);
-       }
-
-       kvm_arm_resume_guest(kvm);
-}
-
-/**
- * vgic_v4_init - Initialize the GICv4 data structures
- * @kvm:       Pointer to the VM being initialized
- *
- * We may be called each time a vITS is created, or when the
- * vgic is initialized. This relies on kvm->lock to be
- * held. In both cases, the number of vcpus should now be
- * fixed.
- */
-int vgic_v4_init(struct kvm *kvm)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       struct kvm_vcpu *vcpu;
-       int i, nr_vcpus, ret;
-
-       if (!kvm_vgic_global_state.has_gicv4)
-               return 0; /* Nothing to see here... move along. */
-
-       if (dist->its_vm.vpes)
-               return 0;
-
-       nr_vcpus = atomic_read(&kvm->online_vcpus);
-
-       dist->its_vm.vpes = kcalloc(nr_vcpus, sizeof(*dist->its_vm.vpes),
-                                   GFP_KERNEL);
-       if (!dist->its_vm.vpes)
-               return -ENOMEM;
-
-       dist->its_vm.nr_vpes = nr_vcpus;
-
-       kvm_for_each_vcpu(i, vcpu, kvm)
-               dist->its_vm.vpes[i] = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
-
-       ret = its_alloc_vcpu_irqs(&dist->its_vm);
-       if (ret < 0) {
-               kvm_err("VPE IRQ allocation failure\n");
-               kfree(dist->its_vm.vpes);
-               dist->its_vm.nr_vpes = 0;
-               dist->its_vm.vpes = NULL;
-               return ret;
-       }
-
-       kvm_for_each_vcpu(i, vcpu, kvm) {
-               int irq = dist->its_vm.vpes[i]->irq;
-               unsigned long irq_flags = DB_IRQ_FLAGS;
-
-               /*
-                * Don't automatically enable the doorbell, as we're
-                * flipping it back and forth when the vcpu gets
-                * blocked. Also disable the lazy disabling, as the
-                * doorbell could kick us out of the guest too
-                * early...
-                *
-                * On GICv4.1, the doorbell is managed in HW and must
-                * be left enabled.
-                */
-               if (kvm_vgic_global_state.has_gicv4_1)
-                       irq_flags &= ~IRQ_NOAUTOEN;
-               irq_set_status_flags(irq, irq_flags);
-
-               ret = request_irq(irq, vgic_v4_doorbell_handler,
-                                 0, "vcpu", vcpu);
-               if (ret) {
-                       kvm_err("failed to allocate vcpu IRQ%d\n", irq);
-                       /*
-                        * Trick: adjust the number of vpes so we know
-                        * how many to nuke on teardown...
-                        */
-                       dist->its_vm.nr_vpes = i;
-                       break;
-               }
-       }
-
-       if (ret)
-               vgic_v4_teardown(kvm);
-
-       return ret;
-}
-
-/**
- * vgic_v4_teardown - Free the GICv4 data structures
- * @kvm:       Pointer to the VM being destroyed
- *
- * Relies on kvm->lock to be held.
- */
-void vgic_v4_teardown(struct kvm *kvm)
-{
-       struct its_vm *its_vm = &kvm->arch.vgic.its_vm;
-       int i;
-
-       if (!its_vm->vpes)
-               return;
-
-       for (i = 0; i < its_vm->nr_vpes; i++) {
-               struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, i);
-               int irq = its_vm->vpes[i]->irq;
-
-               irq_clear_status_flags(irq, DB_IRQ_FLAGS);
-               free_irq(irq, vcpu);
-       }
-
-       its_free_vcpu_irqs(its_vm);
-       kfree(its_vm->vpes);
-       its_vm->nr_vpes = 0;
-       its_vm->vpes = NULL;
-}
-
-int vgic_v4_put(struct kvm_vcpu *vcpu, bool need_db)
-{
-       struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
-
-       if (!vgic_supports_direct_msis(vcpu->kvm) || !vpe->resident)
-               return 0;
-
-       return its_make_vpe_non_resident(vpe, need_db);
-}
-
-int vgic_v4_load(struct kvm_vcpu *vcpu)
-{
-       struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
-       int err;
-
-       if (!vgic_supports_direct_msis(vcpu->kvm) || vpe->resident)
-               return 0;
-
-       /*
-        * Before making the VPE resident, make sure the redistributor
-        * corresponding to our current CPU expects us here. See the
-        * doc in drivers/irqchip/irq-gic-v4.c to understand how this
-        * turns into a VMOVP command at the ITS level.
-        */
-       err = irq_set_affinity(vpe->irq, cpumask_of(smp_processor_id()));
-       if (err)
-               return err;
-
-       err = its_make_vpe_resident(vpe, false, vcpu->kvm->arch.vgic.enabled);
-       if (err)
-               return err;
-
-       /*
-        * Now that the VPE is resident, let's get rid of a potential
-        * doorbell interrupt that would still be pending. This is a
-        * GICv4.0 only "feature"...
-        */
-       if (!kvm_vgic_global_state.has_gicv4_1)
-               err = irq_set_irqchip_state(vpe->irq, IRQCHIP_STATE_PENDING, false);
-
-       return err;
-}
-
-static struct vgic_its *vgic_get_its(struct kvm *kvm,
-                                    struct kvm_kernel_irq_routing_entry *irq_entry)
-{
-       struct kvm_msi msi  = (struct kvm_msi) {
-               .address_lo     = irq_entry->msi.address_lo,
-               .address_hi     = irq_entry->msi.address_hi,
-               .data           = irq_entry->msi.data,
-               .flags          = irq_entry->msi.flags,
-               .devid          = irq_entry->msi.devid,
-       };
-
-       return vgic_msi_to_its(kvm, &msi);
-}
-
-int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
-                              struct kvm_kernel_irq_routing_entry *irq_entry)
-{
-       struct vgic_its *its;
-       struct vgic_irq *irq;
-       struct its_vlpi_map map;
-       int ret;
-
-       if (!vgic_supports_direct_msis(kvm))
-               return 0;
-
-       /*
-        * Get the ITS, and escape early on error (not a valid
-        * doorbell for any of our vITSs).
-        */
-       its = vgic_get_its(kvm, irq_entry);
-       if (IS_ERR(its))
-               return 0;
-
-       mutex_lock(&its->its_lock);
-
-       /* Perform the actual DevID/EventID -> LPI translation. */
-       ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid,
-                                  irq_entry->msi.data, &irq);
-       if (ret)
-               goto out;
-
-       /*
-        * Emit the mapping request. If it fails, the ITS probably
-        * isn't v4 compatible, so let's silently bail out. Holding
-        * the ITS lock should ensure that nothing can modify the
-        * target vcpu.
-        */
-       map = (struct its_vlpi_map) {
-               .vm             = &kvm->arch.vgic.its_vm,
-               .vpe            = &irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe,
-               .vintid         = irq->intid,
-               .properties     = ((irq->priority & 0xfc) |
-                                  (irq->enabled ? LPI_PROP_ENABLED : 0) |
-                                  LPI_PROP_GROUP1),
-               .db_enabled     = true,
-       };
-
-       ret = its_map_vlpi(virq, &map);
-       if (ret)
-               goto out;
-
-       irq->hw         = true;
-       irq->host_irq   = virq;
-       atomic_inc(&map.vpe->vlpi_count);
-
-out:
-       mutex_unlock(&its->its_lock);
-       return ret;
-}
-
-int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int virq,
-                                struct kvm_kernel_irq_routing_entry *irq_entry)
-{
-       struct vgic_its *its;
-       struct vgic_irq *irq;
-       int ret;
-
-       if (!vgic_supports_direct_msis(kvm))
-               return 0;
-
-       /*
-        * Get the ITS, and escape early on error (not a valid
-        * doorbell for any of our vITSs).
-        */
-       its = vgic_get_its(kvm, irq_entry);
-       if (IS_ERR(its))
-               return 0;
-
-       mutex_lock(&its->its_lock);
-
-       ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid,
-                                  irq_entry->msi.data, &irq);
-       if (ret)
-               goto out;
-
-       WARN_ON(!(irq->hw && irq->host_irq == virq));
-       if (irq->hw) {
-               atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count);
-               irq->hw = false;
-               ret = its_unmap_vlpi(virq);
-       }
-
-out:
-       mutex_unlock(&its->its_lock);
-       return ret;
-}
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
deleted file mode 100644 (file)
index 99b02ca..0000000
+++ /dev/null
@@ -1,1011 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2015, 2016 ARM Ltd.
- */
-
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/list_sort.h>
-#include <linux/nospec.h>
-
-#include <asm/kvm_hyp.h>
-
-#include "vgic.h"
-
-#define CREATE_TRACE_POINTS
-#include "trace.h"
-
-struct vgic_global kvm_vgic_global_state __ro_after_init = {
-       .gicv3_cpuif = STATIC_KEY_FALSE_INIT,
-};
-
-/*
- * Locking order is always:
- * kvm->lock (mutex)
- *   its->cmd_lock (mutex)
- *     its->its_lock (mutex)
- *       vgic_cpu->ap_list_lock                must be taken with IRQs disabled
- *         kvm->lpi_list_lock          must be taken with IRQs disabled
- *           vgic_irq->irq_lock                must be taken with IRQs disabled
- *
- * As the ap_list_lock might be taken from the timer interrupt handler,
- * we have to disable IRQs before taking this lock and everything lower
- * than it.
- *
- * If you need to take multiple locks, always take the upper lock first,
- * then the lower ones, e.g. first take the its_lock, then the irq_lock.
- * If you are already holding a lock and need to take a higher one, you
- * have to drop the lower ranking lock first and re-acquire it after having
- * taken the upper one.
- *
- * When taking more than one ap_list_lock at the same time, always take the
- * lowest numbered VCPU's ap_list_lock first, so:
- *   vcpuX->vcpu_id < vcpuY->vcpu_id:
- *     raw_spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock);
- *     raw_spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock);
- *
- * Since the VGIC must support injecting virtual interrupts from ISRs, we have
- * to use the raw_spin_lock_irqsave/raw_spin_unlock_irqrestore versions of outer
- * spinlocks for any lock that may be taken while injecting an interrupt.
- */
-
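/*
 * Editor's sketch, not part of the original file: the canonical pattern
 * implied by the locking order above.  Everything at ap_list_lock level and
 * below is taken with interrupts disabled, and a lower-ranking lock is
 * never held while acquiring a higher-ranking one.
 */
static void example_lock_order(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
{
        unsigned long flags;

        /* upper lock first, with IRQs disabled ... */
        raw_spin_lock_irqsave(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
        /* ... then the per-IRQ lock nested inside it */
        raw_spin_lock(&irq->irq_lock);

        /* release in reverse order */
        raw_spin_unlock(&irq->irq_lock);
        raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
}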
-/*
- * Iterate over the VM's list of mapped LPIs to find the one with a
- * matching interrupt ID and return a reference to the IRQ structure.
- */
-static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       struct vgic_irq *irq = NULL;
-       unsigned long flags;
-
-       raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
-
-       list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
-               if (irq->intid != intid)
-                       continue;
-
-               /*
-                * This increases the refcount, the caller is expected to
-                * call vgic_put_irq() later once it's finished with the IRQ.
-                */
-               vgic_get_irq_kref(irq);
-               goto out_unlock;
-       }
-       irq = NULL;
-
-out_unlock:
-       raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
-
-       return irq;
-}
-
-/*
- * This looks up the virtual interrupt ID to get the corresponding
- * struct vgic_irq. It also increases the refcount, so any caller is expected
- * to call vgic_put_irq() once it's finished with this IRQ.
- */
-struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
-                             u32 intid)
-{
-       /* SGIs and PPIs */
-       if (intid <= VGIC_MAX_PRIVATE) {
-               intid = array_index_nospec(intid, VGIC_MAX_PRIVATE + 1);
-               return &vcpu->arch.vgic_cpu.private_irqs[intid];
-       }
-
-       /* SPIs */
-       if (intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) {
-               intid = array_index_nospec(intid, kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS);
-               return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS];
-       }
-
-       /* LPIs */
-       if (intid >= VGIC_MIN_LPI)
-               return vgic_get_lpi(kvm, intid);
-
-       WARN(1, "Looking up struct vgic_irq for reserved INTID");
-       return NULL;
-}
-
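/*
 * Editor's note (illustration, not part of the original file): the INTID
 * ranges dispatched above follow the GICv3 architecture:
 *
 *   0..15    SGIs  \ per-vcpu "private" interrupts, served from
 *   16..31   PPIs  / vcpu->arch.vgic_cpu.private_irqs[]
 *   32..     SPIs    shared, served from kvm->arch.vgic.spis[] (nr_spis of them)
 *   8192..   LPIs    looked up (and refcounted) via vgic_get_lpi()
 *
 * e.g. vgic_get_irq(kvm, vcpu, 27) returns that vcpu's private IRQ 27,
 * which is commonly the virtual timer PPI.
 */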
-/*
- * We can't do anything in here, because we lack the kvm pointer to
- * lock and remove the item from the lpi_list. So we keep this function
- * empty and use the return value of kref_put() to trigger the freeing.
- */
-static void vgic_irq_release(struct kref *ref)
-{
-}
-
-/*
- * Drop the refcount on the LPI. Must be called with lpi_list_lock held.
- */
-void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-
-       if (!kref_put(&irq->refcount, vgic_irq_release))
-               return;
-
-       list_del(&irq->lpi_list);
-       dist->lpi_list_count--;
-
-       kfree(irq);
-}
-
-void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       unsigned long flags;
-
-       if (irq->intid < VGIC_MIN_LPI)
-               return;
-
-       raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
-       __vgic_put_lpi_locked(kvm, irq);
-       raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
-}
-
-void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu)
-{
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-       struct vgic_irq *irq, *tmp;
-       unsigned long flags;
-
-       raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
-
-       list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
-               if (irq->intid >= VGIC_MIN_LPI) {
-                       raw_spin_lock(&irq->irq_lock);
-                       list_del(&irq->ap_list);
-                       irq->vcpu = NULL;
-                       raw_spin_unlock(&irq->irq_lock);
-                       vgic_put_irq(vcpu->kvm, irq);
-               }
-       }
-
-       raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
-}
-
-void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending)
-{
-       WARN_ON(irq_set_irqchip_state(irq->host_irq,
-                                     IRQCHIP_STATE_PENDING,
-                                     pending));
-}
-
-bool vgic_get_phys_line_level(struct vgic_irq *irq)
-{
-       bool line_level;
-
-       BUG_ON(!irq->hw);
-
-       if (irq->get_input_level)
-               return irq->get_input_level(irq->intid);
-
-       WARN_ON(irq_get_irqchip_state(irq->host_irq,
-                                     IRQCHIP_STATE_PENDING,
-                                     &line_level));
-       return line_level;
-}
-
-/* Set/Clear the physical active state */
-void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active)
-{
-       BUG_ON(!irq->hw);
-       WARN_ON(irq_set_irqchip_state(irq->host_irq,
-                                     IRQCHIP_STATE_ACTIVE,
-                                     active));
-}
-
-/**
- * vgic_target_oracle - compute the target vcpu for an irq
- *
- * @irq:       The irq to route. Must be already locked.
- *
- * Based on the current state of the interrupt (enabled, pending,
- * active, vcpu and target_vcpu), compute the next vcpu this should be
- * given to. Return NULL if this shouldn't be injected at all.
- *
- * Requires the IRQ lock to be held.
- */
-static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq)
-{
-       lockdep_assert_held(&irq->irq_lock);
-
-       /* If the interrupt is active, it must stay on the current vcpu */
-       if (irq->active)
-               return irq->vcpu ? : irq->target_vcpu;
-
-       /*
-        * If the IRQ is not active but enabled and pending, we should direct
-        * it to its configured target VCPU.
-        * If the distributor is disabled, pending interrupts shouldn't be
-        * forwarded.
-        */
-       if (irq->enabled && irq_is_pending(irq)) {
-               if (unlikely(irq->target_vcpu &&
-                            !irq->target_vcpu->kvm->arch.vgic.enabled))
-                       return NULL;
-
-               return irq->target_vcpu;
-       }
-
-       /*
-        * If neither active nor pending and enabled, then this IRQ should
-        * not be queued to any VCPU.
-        */
-       return NULL;
-}
-
-/*
- * The order of items in the ap_lists defines how we'll pack things in LRs as
- * well, the first items in the list being the first things populated in the
- * LRs.
- *
- * A hard rule is that active interrupts can never be pushed out of the LRs
- * (and therefore take priority) since we cannot reliably trap on deactivation
- * of IRQs and therefore they have to be present in the LRs.
- *
- * Otherwise things should be sorted by the priority field and the GIC
- * hardware support will take care of preemption of priority groups etc.
- *
- * Return negative if "a" sorts before "b", 0 to preserve order, and positive
- * to sort "b" before "a".
- */
-static int vgic_irq_cmp(void *priv, struct list_head *a, struct list_head *b)
-{
-       struct vgic_irq *irqa = container_of(a, struct vgic_irq, ap_list);
-       struct vgic_irq *irqb = container_of(b, struct vgic_irq, ap_list);
-       bool penda, pendb;
-       int ret;
-
-       /*
-        * list_sort may call this function with the same element when
-        * the list is fairly long.
-        */
-       if (unlikely(irqa == irqb))
-               return 0;
-
-       raw_spin_lock(&irqa->irq_lock);
-       raw_spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING);
-
-       if (irqa->active || irqb->active) {
-               ret = (int)irqb->active - (int)irqa->active;
-               goto out;
-       }
-
-       penda = irqa->enabled && irq_is_pending(irqa);
-       pendb = irqb->enabled && irq_is_pending(irqb);
-
-       if (!penda || !pendb) {
-               ret = (int)pendb - (int)penda;
-               goto out;
-       }
-
-       /* Both pending and enabled, sort by priority */
-       ret = irqa->priority - irqb->priority;
-out:
-       raw_spin_unlock(&irqb->irq_lock);
-       raw_spin_unlock(&irqa->irq_lock);
-       return ret;
-}
-
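/*
 * Editor's note (illustration, not part of the original file): with the
 * comparator above, an ap_list such as
 *
 *   {pending, prio 0xa0} {active, prio 0xf0} {pending, prio 0x20} {disabled}
 *
 * sorts to
 *
 *   {active, prio 0xf0} {pending, prio 0x20} {pending, prio 0xa0} {disabled}
 *
 * i.e. active entries first, then pending+enabled entries by ascending
 * priority value (lower value == higher GIC priority), then everything else.
 */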
-/* Must be called with the ap_list_lock held */
-static void vgic_sort_ap_list(struct kvm_vcpu *vcpu)
-{
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
-       lockdep_assert_held(&vgic_cpu->ap_list_lock);
-
-       list_sort(NULL, &vgic_cpu->ap_list_head, vgic_irq_cmp);
-}
-
-/*
- * Only valid injection if changing level for level-triggered IRQs or for a
- * rising edge, and in-kernel connected IRQ lines can only be controlled by
- * their owner.
- */
-static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owner)
-{
-       if (irq->owner != owner)
-               return false;
-
-       switch (irq->config) {
-       case VGIC_CONFIG_LEVEL:
-               return irq->line_level != level;
-       case VGIC_CONFIG_EDGE:
-               return level;
-       }
-
-       return false;
-}
-
-/*
- * Check whether an IRQ needs to (and can) be queued to a VCPU's ap list.
- * Do the queuing if necessary, taking the right locks in the right order.
- * Returns true when the IRQ was queued, false otherwise.
- *
- * Needs to be entered with the IRQ lock already held, but will return
- * with all locks dropped.
- */
-bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
-                          unsigned long flags)
-{
-       struct kvm_vcpu *vcpu;
-
-       lockdep_assert_held(&irq->irq_lock);
-
-retry:
-       vcpu = vgic_target_oracle(irq);
-       if (irq->vcpu || !vcpu) {
-               /*
-                * If this IRQ is already on a VCPU's ap_list, then it
-                * cannot be moved or modified and there is no more work for
-                * us to do.
-                *
-                * Otherwise, if the irq is not pending and enabled, it does
-                * not need to be inserted into an ap_list and there is also
-                * no more work for us to do.
-                */
-               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-               /*
-                * We have to kick the VCPU here, because we could be
-                * queueing an edge-triggered interrupt for which we
-                * get no EOI maintenance interrupt. In that case,
-                * while the IRQ is already on the VCPU's AP list, the
-                * VCPU could have EOI'ed the original interrupt and
-                * won't see this one until it exits for some other
-                * reason.
-                */
-               if (vcpu) {
-                       kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
-                       kvm_vcpu_kick(vcpu);
-               }
-               return false;
-       }
-
-       /*
-        * We must unlock the irq lock to take the ap_list_lock where
-        * we are going to insert this new pending interrupt.
-        */
-       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-       /* someone can do stuff here, which we re-check below */
-
-       raw_spin_lock_irqsave(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
-       raw_spin_lock(&irq->irq_lock);
-
-       /*
-        * Did something change behind our backs?
-        *
-        * There are two cases:
-        * 1) The irq lost its pending state or was disabled behind our
-        *    backs and/or it was queued to another VCPU's ap_list.
-        * 2) Someone changed the affinity on this irq behind our
-        *    backs and we are now holding the wrong ap_list_lock.
-        *
-        * In both cases, drop the locks and retry.
-        */
-
-       if (unlikely(irq->vcpu || vcpu != vgic_target_oracle(irq))) {
-               raw_spin_unlock(&irq->irq_lock);
-               raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock,
-                                          flags);
-
-               raw_spin_lock_irqsave(&irq->irq_lock, flags);
-               goto retry;
-       }
-
-       /*
-        * Grab a reference to the irq to reflect the fact that it is
-        * now in the ap_list.
-        */
-       vgic_get_irq_kref(irq);
-       list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
-       irq->vcpu = vcpu;
-
-       raw_spin_unlock(&irq->irq_lock);
-       raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
-
-       kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
-       kvm_vcpu_kick(vcpu);
-
-       return true;
-}
-
-/**
- * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic
- * @kvm:     The VM structure pointer
- * @cpuid:   The CPU for PPIs
- * @intid:   The INTID to inject a new state to.
- * @level:   Edge-triggered:  true:  to trigger the interrupt
- *                           false: to ignore the call
- *          Level-sensitive  true:  raise the input signal
- *                           false: lower the input signal
- * @owner:   The opaque pointer to the owner of the IRQ being raised to verify
- *           that the caller is allowed to inject this IRQ.  Userspace
- *           injections will have owner == NULL.
- *
- * The VGIC is not concerned with devices being active-LOW or active-HIGH for
- * level-sensitive interrupts.  You can think of the level parameter as 1
- * being HIGH and 0 being LOW and all devices being active-HIGH.
- */
-int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
-                       bool level, void *owner)
-{
-       struct kvm_vcpu *vcpu;
-       struct vgic_irq *irq;
-       unsigned long flags;
-       int ret;
-
-       trace_vgic_update_irq_pending(cpuid, intid, level);
-
-       ret = vgic_lazy_init(kvm);
-       if (ret)
-               return ret;
-
-       vcpu = kvm_get_vcpu(kvm, cpuid);
-       if (!vcpu && intid < VGIC_NR_PRIVATE_IRQS)
-               return -EINVAL;
-
-       irq = vgic_get_irq(kvm, vcpu, intid);
-       if (!irq)
-               return -EINVAL;
-
-       raw_spin_lock_irqsave(&irq->irq_lock, flags);
-
-       if (!vgic_validate_injection(irq, level, owner)) {
-               /* Nothing to see here, move along... */
-               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-               vgic_put_irq(kvm, irq);
-               return 0;
-       }
-
-       if (irq->config == VGIC_CONFIG_LEVEL)
-               irq->line_level = level;
-       else
-               irq->pending_latch = true;
-
-       vgic_queue_irq_unlock(kvm, irq, flags);
-       vgic_put_irq(kvm, irq);
-
-       return 0;
-}
-
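/*
 * Editor's sketch, not part of the original file: pulsing a level-sensitive
 * interrupt from an in-kernel device model.  The INTID and owner cookie are
 * illustrative; @owner must match what was registered with
 * kvm_vgic_set_owner() (or be NULL for userspace-style injection).
 */
static int example_pulse_level_irq(struct kvm *kvm, unsigned int intid,
                                   void *owner)
{
        int ret;

        /* raise the (virtual) input line ... */
        ret = kvm_vgic_inject_irq(kvm, 0, intid, true, owner);
        if (ret)
                return ret;

        /* ... and lower it again once the device has been serviced */
        return kvm_vgic_inject_irq(kvm, 0, intid, false, owner);
}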
-/* @irq->irq_lock must be held */
-static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
-                           unsigned int host_irq,
-                           bool (*get_input_level)(int vintid))
-{
-       struct irq_desc *desc;
-       struct irq_data *data;
-
-       /*
-        * Find the physical IRQ number corresponding to @host_irq
-        */
-       desc = irq_to_desc(host_irq);
-       if (!desc) {
-               kvm_err("%s: no interrupt descriptor\n", __func__);
-               return -EINVAL;
-       }
-       data = irq_desc_get_irq_data(desc);
-       while (data->parent_data)
-               data = data->parent_data;
-
-       irq->hw = true;
-       irq->host_irq = host_irq;
-       irq->hwintid = data->hwirq;
-       irq->get_input_level = get_input_level;
-       return 0;
-}
-
-/* @irq->irq_lock must be held */
-static inline void kvm_vgic_unmap_irq(struct vgic_irq *irq)
-{
-       irq->hw = false;
-       irq->hwintid = 0;
-       irq->get_input_level = NULL;
-}
-
-int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
-                         u32 vintid, bool (*get_input_level)(int vintid))
-{
-       struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
-       unsigned long flags;
-       int ret;
-
-       BUG_ON(!irq);
-
-       raw_spin_lock_irqsave(&irq->irq_lock, flags);
-       ret = kvm_vgic_map_irq(vcpu, irq, host_irq, get_input_level);
-       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-       vgic_put_irq(vcpu->kvm, irq);
-
-       return ret;
-}
-
-/**
- * kvm_vgic_reset_mapped_irq - Reset a mapped IRQ
- * @vcpu: The VCPU pointer
- * @vintid: The INTID of the interrupt
- *
- * Reset the active and pending states of a mapped interrupt.  Kernel
- * subsystems injecting mapped interrupts should reset their interrupt lines
- * when we are doing a reset of the VM.
- */
-void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid)
-{
-       struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
-       unsigned long flags;
-
-       if (!irq->hw)
-               goto out;
-
-       raw_spin_lock_irqsave(&irq->irq_lock, flags);
-       irq->active = false;
-       irq->pending_latch = false;
-       irq->line_level = false;
-       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-out:
-       vgic_put_irq(vcpu->kvm, irq);
-}
-
-int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid)
-{
-       struct vgic_irq *irq;
-       unsigned long flags;
-
-       if (!vgic_initialized(vcpu->kvm))
-               return -EAGAIN;
-
-       irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
-       BUG_ON(!irq);
-
-       raw_spin_lock_irqsave(&irq->irq_lock, flags);
-       kvm_vgic_unmap_irq(irq);
-       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-       vgic_put_irq(vcpu->kvm, irq);
-
-       return 0;
-}
-
-/**
- * kvm_vgic_set_owner - Set the owner of an interrupt for a VM
- *
- * @vcpu:   Pointer to the VCPU (used for PPIs)
- * @intid:  The virtual INTID identifying the interrupt (PPI or SPI)
- * @owner:  Opaque pointer to the owner
- *
- * Returns 0 if intid is not already used by another in-kernel device and the
- * owner is set, otherwise returns an error code.
- */
-int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner)
-{
-       struct vgic_irq *irq;
-       unsigned long flags;
-       int ret = 0;
-
-       if (!vgic_initialized(vcpu->kvm))
-               return -EAGAIN;
-
-       /* SGIs and LPIs cannot be wired up to any device */
-       if (!irq_is_ppi(intid) && !vgic_valid_spi(vcpu->kvm, intid))
-               return -EINVAL;
-
-       irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
-       raw_spin_lock_irqsave(&irq->irq_lock, flags);
-       if (irq->owner && irq->owner != owner)
-               ret = -EEXIST;
-       else
-               irq->owner = owner;
-       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-       return ret;
-}
-
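/*
 * Editor's sketch, not part of the original file: how an in-kernel user
 * (in the tree, the arch timer and PMU emulation do this for their PPIs)
 * reserves a line before injecting it.  The function and parameter names
 * here are illustrative.
 */
static int example_claim_and_raise_ppi(struct kvm_vcpu *vcpu,
                                       unsigned int ppi_intid, void *dev)
{
        int ret;

        ret = kvm_vgic_set_owner(vcpu, ppi_intid, dev);
        if (ret)        /* -EEXIST: another device already owns this INTID */
                return ret;

        /* later injections must present the same owner cookie */
        return kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, ppi_intid,
                                   true, dev);
}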
-/**
- * vgic_prune_ap_list - Remove non-relevant interrupts from the list
- *
- * @vcpu: The VCPU pointer
- *
- * Go over the list of "interesting" interrupts, and prune those that we
- * won't have to consider in the near future.
- */
-static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
-{
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-       struct vgic_irq *irq, *tmp;
-
-       DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
-
-retry:
-       raw_spin_lock(&vgic_cpu->ap_list_lock);
-
-       list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
-               struct kvm_vcpu *target_vcpu, *vcpuA, *vcpuB;
-               bool target_vcpu_needs_kick = false;
-
-               raw_spin_lock(&irq->irq_lock);
-
-               BUG_ON(vcpu != irq->vcpu);
-
-               target_vcpu = vgic_target_oracle(irq);
-
-               if (!target_vcpu) {
-                       /*
-                        * We don't need to process this interrupt any
-                        * further, move it off the list.
-                        */
-                       list_del(&irq->ap_list);
-                       irq->vcpu = NULL;
-                       raw_spin_unlock(&irq->irq_lock);
-
-                       /*
-                        * This vgic_put_irq call matches the
-                        * vgic_get_irq_kref in vgic_queue_irq_unlock,
-                        * where we added the LPI to the ap_list. As
-                        * we remove the irq from the list, we also drop
-                        * the refcount.
-                        */
-                       vgic_put_irq(vcpu->kvm, irq);
-                       continue;
-               }
-
-               if (target_vcpu == vcpu) {
-                       /* We're on the right CPU */
-                       raw_spin_unlock(&irq->irq_lock);
-                       continue;
-               }
-
-               /* This interrupt looks like it has to be migrated. */
-
-               raw_spin_unlock(&irq->irq_lock);
-               raw_spin_unlock(&vgic_cpu->ap_list_lock);
-
-               /*
-                * Ensure locking order by always locking the smallest
-                * ID first.
-                */
-               if (vcpu->vcpu_id < target_vcpu->vcpu_id) {
-                       vcpuA = vcpu;
-                       vcpuB = target_vcpu;
-               } else {
-                       vcpuA = target_vcpu;
-                       vcpuB = vcpu;
-               }
-
-               raw_spin_lock(&vcpuA->arch.vgic_cpu.ap_list_lock);
-               raw_spin_lock_nested(&vcpuB->arch.vgic_cpu.ap_list_lock,
-                                     SINGLE_DEPTH_NESTING);
-               raw_spin_lock(&irq->irq_lock);
-
-               /*
-                * If the affinity has been preserved, move the
-                * interrupt around. Otherwise, it means things have
-                * changed while the interrupt was unlocked, and we
-                * need to replay this.
-                *
-                * In all cases, we cannot trust the list not to have
-                * changed, so we restart from the beginning.
-                */
-               if (target_vcpu == vgic_target_oracle(irq)) {
-                       struct vgic_cpu *new_cpu = &target_vcpu->arch.vgic_cpu;
-
-                       list_del(&irq->ap_list);
-                       irq->vcpu = target_vcpu;
-                       list_add_tail(&irq->ap_list, &new_cpu->ap_list_head);
-                       target_vcpu_needs_kick = true;
-               }
-
-               raw_spin_unlock(&irq->irq_lock);
-               raw_spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock);
-               raw_spin_unlock(&vcpuA->arch.vgic_cpu.ap_list_lock);
-
-               if (target_vcpu_needs_kick) {
-                       kvm_make_request(KVM_REQ_IRQ_PENDING, target_vcpu);
-                       kvm_vcpu_kick(target_vcpu);
-               }
-
-               goto retry;
-       }
-
-       raw_spin_unlock(&vgic_cpu->ap_list_lock);
-}
-
-static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
-{
-       if (kvm_vgic_global_state.type == VGIC_V2)
-               vgic_v2_fold_lr_state(vcpu);
-       else
-               vgic_v3_fold_lr_state(vcpu);
-}
-
-/* Requires the irq_lock to be held. */
-static inline void vgic_populate_lr(struct kvm_vcpu *vcpu,
-                                   struct vgic_irq *irq, int lr)
-{
-       lockdep_assert_held(&irq->irq_lock);
-
-       if (kvm_vgic_global_state.type == VGIC_V2)
-               vgic_v2_populate_lr(vcpu, irq, lr);
-       else
-               vgic_v3_populate_lr(vcpu, irq, lr);
-}
-
-static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr)
-{
-       if (kvm_vgic_global_state.type == VGIC_V2)
-               vgic_v2_clear_lr(vcpu, lr);
-       else
-               vgic_v3_clear_lr(vcpu, lr);
-}
-
-static inline void vgic_set_underflow(struct kvm_vcpu *vcpu)
-{
-       if (kvm_vgic_global_state.type == VGIC_V2)
-               vgic_v2_set_underflow(vcpu);
-       else
-               vgic_v3_set_underflow(vcpu);
-}
-
-/* Requires the ap_list_lock to be held. */
-static int compute_ap_list_depth(struct kvm_vcpu *vcpu,
-                                bool *multi_sgi)
-{
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-       struct vgic_irq *irq;
-       int count = 0;
-
-       *multi_sgi = false;
-
-       lockdep_assert_held(&vgic_cpu->ap_list_lock);
-
-       list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
-               int w;
-
-               raw_spin_lock(&irq->irq_lock);
-               /* GICv2 SGIs can count for more than one... */
-               w = vgic_irq_get_lr_count(irq);
-               raw_spin_unlock(&irq->irq_lock);
-
-               count += w;
-               *multi_sgi |= (w > 1);
-       }
-       return count;
-}
-
-/* Requires the VCPU's ap_list_lock to be held. */
-static void vgic_flush_lr_state(struct kvm_vcpu *vcpu)
-{
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-       struct vgic_irq *irq;
-       int count;
-       bool multi_sgi;
-       u8 prio = 0xff;
-
-       lockdep_assert_held(&vgic_cpu->ap_list_lock);
-
-       count = compute_ap_list_depth(vcpu, &multi_sgi);
-       if (count > kvm_vgic_global_state.nr_lr || multi_sgi)
-               vgic_sort_ap_list(vcpu);
-
-       count = 0;
-
-       list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
-               raw_spin_lock(&irq->irq_lock);
-
-               /*
-                * If we have multi-SGIs in the pipeline, we need to
-                * guarantee that they are all seen before any IRQ of
-                * lower priority. In that case, we need to filter out
-                * these interrupts by exiting early. This is easy as
-                * the AP list has been sorted already.
-                */
-               if (multi_sgi && irq->priority > prio) {
-                       raw_spin_unlock(&irq->irq_lock);
-                       break;
-               }
-
-               if (likely(vgic_target_oracle(irq) == vcpu)) {
-                       vgic_populate_lr(vcpu, irq, count++);
-
-                       if (irq->source)
-                               prio = irq->priority;
-               }
-
-               raw_spin_unlock(&irq->irq_lock);
-
-               if (count == kvm_vgic_global_state.nr_lr) {
-                       if (!list_is_last(&irq->ap_list,
-                                         &vgic_cpu->ap_list_head))
-                               vgic_set_underflow(vcpu);
-                       break;
-               }
-       }
-
-       vcpu->arch.vgic_cpu.used_lrs = count;
-
-       /* Nuke remaining LRs */
-       for ( ; count < kvm_vgic_global_state.nr_lr; count++)
-               vgic_clear_lr(vcpu, count);
-}
-
-static inline bool can_access_vgic_from_kernel(void)
-{
-       /*
-        * GICv2 can always be accessed from the kernel because it is
-        * memory-mapped, and VHE systems can access GICv3 EL2 system
-        * registers.
-        */
-       return !static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) || has_vhe();
-}
-
-static inline void vgic_save_state(struct kvm_vcpu *vcpu)
-{
-       if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
-               vgic_v2_save_state(vcpu);
-       else
-               __vgic_v3_save_state(vcpu);
-}
-
-/* Sync back the hardware VGIC state into our emulation after a guest's run. */
-void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
-{
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
-       /* An empty ap_list_head implies used_lrs == 0 */
-       if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head))
-               return;
-
-       if (can_access_vgic_from_kernel())
-               vgic_save_state(vcpu);
-
-       if (vgic_cpu->used_lrs)
-               vgic_fold_lr_state(vcpu);
-       vgic_prune_ap_list(vcpu);
-}
-
-static inline void vgic_restore_state(struct kvm_vcpu *vcpu)
-{
-       if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
-               vgic_v2_restore_state(vcpu);
-       else
-               __vgic_v3_restore_state(vcpu);
-}
-
-/* Flush our emulation state into the GIC hardware before entering the guest. */
-void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
-{
-       /*
-        * If there are no virtual interrupts active or pending for this
-        * VCPU, then there is no work to do and we can bail out without
-        * taking any lock.  There is a potential race with someone injecting
-        * interrupts to the VCPU, but it is a benign race as the VCPU will
-        * either observe the new interrupt before or after doing this check,
-        * and introducing an additional synchronization mechanism doesn't change
-        * this.
-        *
-        * Note that we still need to go through the whole thing if anything
-        * can be directly injected (GICv4).
-        */
-       if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head) &&
-           !vgic_supports_direct_msis(vcpu->kvm))
-               return;
-
-       DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
-
-       if (!list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) {
-               raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
-               vgic_flush_lr_state(vcpu);
-               raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
-       }
-
-       if (can_access_vgic_from_kernel())
-               vgic_restore_state(vcpu);
-}
-
-void kvm_vgic_load(struct kvm_vcpu *vcpu)
-{
-       if (unlikely(!vgic_initialized(vcpu->kvm)))
-               return;
-
-       if (kvm_vgic_global_state.type == VGIC_V2)
-               vgic_v2_load(vcpu);
-       else
-               vgic_v3_load(vcpu);
-}
-
-void kvm_vgic_put(struct kvm_vcpu *vcpu)
-{
-       if (unlikely(!vgic_initialized(vcpu->kvm)))
-               return;
-
-       if (kvm_vgic_global_state.type == VGIC_V2)
-               vgic_v2_put(vcpu);
-       else
-               vgic_v3_put(vcpu);
-}
-
-void kvm_vgic_vmcr_sync(struct kvm_vcpu *vcpu)
-{
-       if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
-               return;
-
-       if (kvm_vgic_global_state.type == VGIC_V2)
-               vgic_v2_vmcr_sync(vcpu);
-       else
-               vgic_v3_vmcr_sync(vcpu);
-}
-
-int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
-{
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-       struct vgic_irq *irq;
-       bool pending = false;
-       unsigned long flags;
-       struct vgic_vmcr vmcr;
-
-       if (!vcpu->kvm->arch.vgic.enabled)
-               return false;
-
-       if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last)
-               return true;
-
-       vgic_get_vmcr(vcpu, &vmcr);
-
-       raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
-
-       list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
-               raw_spin_lock(&irq->irq_lock);
-               pending = irq_is_pending(irq) && irq->enabled &&
-                         !irq->active &&
-                         irq->priority < vmcr.pmr;
-               raw_spin_unlock(&irq->irq_lock);
-
-               if (pending)
-                       break;
-       }
-
-       raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
-
-       return pending;
-}
-
-void vgic_kick_vcpus(struct kvm *kvm)
-{
-       struct kvm_vcpu *vcpu;
-       int c;
-
-       /*
-        * We've injected an interrupt, time to find out who deserves
-        * a good kick...
-        */
-       kvm_for_each_vcpu(c, vcpu, kvm) {
-               if (kvm_vgic_vcpu_pending_irq(vcpu)) {
-                       kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
-                       kvm_vcpu_kick(vcpu);
-               }
-       }
-}
-
-bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid)
-{
-       struct vgic_irq *irq;
-       bool map_is_active;
-       unsigned long flags;
-
-       if (!vgic_initialized(vcpu->kvm))
-               return false;
-
-       irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
-       raw_spin_lock_irqsave(&irq->irq_lock, flags);
-       map_is_active = irq->hw && irq->active;
-       raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-       vgic_put_irq(vcpu->kvm, irq);
-
-       return map_is_active;
-}
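
The pending-interrupt walk in kvm_vgic_vcpu_pending_irq() above relies on
the GIC priority convention: a lower numerical value is a higher priority,
and an interrupt is only signalled when its priority is strictly higher
(numerically lower) than the priority mask, hence the
irq->priority < vmcr.pmr test. A minimal standalone sketch of that
comparison, using hypothetical names and for illustration only:

#include <stdbool.h>
#include <stdint.h>

/* GIC convention: 0x00 is the highest priority, 0xff the lowest. */
static bool gic_prio_allows_delivery(uint8_t irq_prio, uint8_t pmr)
{
        return irq_prio < pmr;
}

/*
 * With pmr == 0x80, an interrupt at priority 0x70 is deliverable, while
 * one at priority 0x90 (or at 0x80 itself) stays masked.
 */
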
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
deleted file mode 100644 (file)
index 769e480..0000000
+++ /dev/null
@@ -1,321 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2015, 2016 ARM Ltd.
- */
-#ifndef __KVM_ARM_VGIC_NEW_H__
-#define __KVM_ARM_VGIC_NEW_H__
-
-#include <linux/irqchip/arm-gic-common.h>
-
-#define PRODUCT_ID_KVM         0x4b    /* ASCII code K */
-#define IMPLEMENTER_ARM                0x43b
-
-#define VGIC_ADDR_UNDEF                (-1)
-#define IS_VGIC_ADDR_UNDEF(_x)  ((_x) == VGIC_ADDR_UNDEF)
-
-#define INTERRUPT_ID_BITS_SPIS 10
-#define INTERRUPT_ID_BITS_ITS  16
-#define VGIC_PRI_BITS          5
-
-#define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS)
-
-#define VGIC_AFFINITY_0_SHIFT 0
-#define VGIC_AFFINITY_0_MASK (0xffUL << VGIC_AFFINITY_0_SHIFT)
-#define VGIC_AFFINITY_1_SHIFT 8
-#define VGIC_AFFINITY_1_MASK (0xffUL << VGIC_AFFINITY_1_SHIFT)
-#define VGIC_AFFINITY_2_SHIFT 16
-#define VGIC_AFFINITY_2_MASK (0xffUL << VGIC_AFFINITY_2_SHIFT)
-#define VGIC_AFFINITY_3_SHIFT 24
-#define VGIC_AFFINITY_3_MASK (0xffUL << VGIC_AFFINITY_3_SHIFT)
-
-#define VGIC_AFFINITY_LEVEL(reg, level) \
-       ((((reg) & VGIC_AFFINITY_## level ##_MASK) \
-       >> VGIC_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))
-
-/*
- * Userspace encodes the affinity differently from the MPIDR. The macro
- * below converts the vgic userspace format to the MPIDR register format.
- */
-#define VGIC_TO_MPIDR(val) (VGIC_AFFINITY_LEVEL(val, 0) | \
-                           VGIC_AFFINITY_LEVEL(val, 1) | \
-                           VGIC_AFFINITY_LEVEL(val, 2) | \
-                           VGIC_AFFINITY_LEVEL(val, 3))
-
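/*
 * Illustrative sketch only (not part of the original header): a worked
 * conversion through VGIC_TO_MPIDR(), assuming the arm64
 * MPIDR_LEVEL_SHIFT() values, i.e. Aff0..Aff2 at bits [7:0], [15:8],
 * [23:16] and Aff3 at bits [39:32].  The helper name is hypothetical.
 */
static inline u64 vgic_affinity_example(void)
{
        /* Userspace layout: Aff3=4, Aff2=3, Aff1=2, Aff0=1. */
        return VGIC_TO_MPIDR(0x04030201UL);     /* == 0x0000000400030201 */
}
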
-/*
- * As per Documentation/virt/kvm/devices/arm-vgic-v3.txt, the macros
- * below define the CPUREG encoding.
- */
-#define KVM_REG_ARM_VGIC_SYSREG_OP0_MASK   0x000000000000c000
-#define KVM_REG_ARM_VGIC_SYSREG_OP0_SHIFT  14
-#define KVM_REG_ARM_VGIC_SYSREG_OP1_MASK   0x0000000000003800
-#define KVM_REG_ARM_VGIC_SYSREG_OP1_SHIFT  11
-#define KVM_REG_ARM_VGIC_SYSREG_CRN_MASK   0x0000000000000780
-#define KVM_REG_ARM_VGIC_SYSREG_CRN_SHIFT  7
-#define KVM_REG_ARM_VGIC_SYSREG_CRM_MASK   0x0000000000000078
-#define KVM_REG_ARM_VGIC_SYSREG_CRM_SHIFT  3
-#define KVM_REG_ARM_VGIC_SYSREG_OP2_MASK   0x0000000000000007
-#define KVM_REG_ARM_VGIC_SYSREG_OP2_SHIFT  0
-
-#define KVM_DEV_ARM_VGIC_SYSREG_MASK (KVM_REG_ARM_VGIC_SYSREG_OP0_MASK | \
-                                     KVM_REG_ARM_VGIC_SYSREG_OP1_MASK | \
-                                     KVM_REG_ARM_VGIC_SYSREG_CRN_MASK | \
-                                     KVM_REG_ARM_VGIC_SYSREG_CRM_MASK | \
-                                     KVM_REG_ARM_VGIC_SYSREG_OP2_MASK)
-
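/*
 * Illustrative sketch only (not part of the original header): splitting a
 * userspace register id into its Op0/Op1/CRn/CRm/Op2 fields with the
 * masks and shifts above.  The helper name is hypothetical.
 */
static inline void vgic_sysreg_id_example(u64 id, u8 *op0, u8 *op1,
                                          u8 *crn, u8 *crm, u8 *op2)
{
        *op0 = (id & KVM_REG_ARM_VGIC_SYSREG_OP0_MASK) >> KVM_REG_ARM_VGIC_SYSREG_OP0_SHIFT;
        *op1 = (id & KVM_REG_ARM_VGIC_SYSREG_OP1_MASK) >> KVM_REG_ARM_VGIC_SYSREG_OP1_SHIFT;
        *crn = (id & KVM_REG_ARM_VGIC_SYSREG_CRN_MASK) >> KVM_REG_ARM_VGIC_SYSREG_CRN_SHIFT;
        *crm = (id & KVM_REG_ARM_VGIC_SYSREG_CRM_MASK) >> KVM_REG_ARM_VGIC_SYSREG_CRM_SHIFT;
        *op2 = (id & KVM_REG_ARM_VGIC_SYSREG_OP2_MASK) >> KVM_REG_ARM_VGIC_SYSREG_OP2_SHIFT;
}
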
-/*
- * As per Documentation/virt/kvm/devices/arm-vgic-its.txt, the macros
- * below define the ITS table entry encoding.
- */
-#define KVM_ITS_CTE_VALID_SHIFT                63
-#define KVM_ITS_CTE_VALID_MASK         BIT_ULL(63)
-#define KVM_ITS_CTE_RDBASE_SHIFT       16
-#define KVM_ITS_CTE_ICID_MASK          GENMASK_ULL(15, 0)
-#define KVM_ITS_ITE_NEXT_SHIFT         48
-#define KVM_ITS_ITE_PINTID_SHIFT       16
-#define KVM_ITS_ITE_PINTID_MASK                GENMASK_ULL(47, 16)
-#define KVM_ITS_ITE_ICID_MASK          GENMASK_ULL(15, 0)
-#define KVM_ITS_DTE_VALID_SHIFT                63
-#define KVM_ITS_DTE_VALID_MASK         BIT_ULL(63)
-#define KVM_ITS_DTE_NEXT_SHIFT         49
-#define KVM_ITS_DTE_NEXT_MASK          GENMASK_ULL(62, 49)
-#define KVM_ITS_DTE_ITTADDR_SHIFT      5
-#define KVM_ITS_DTE_ITTADDR_MASK       GENMASK_ULL(48, 5)
-#define KVM_ITS_DTE_SIZE_MASK          GENMASK_ULL(4, 0)
-#define KVM_ITS_L1E_VALID_MASK         BIT_ULL(63)
-/* we only support 64 kB translation table page size */
-#define KVM_ITS_L1E_ADDR_MASK          GENMASK_ULL(51, 16)
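
/*
 * Illustrative sketch only (not part of the original header): how the
 * device table entry (DTE) masks and shifts above compose when pulling
 * the raw fields out of a 64-bit entry.  The helper name is hypothetical
 * and no claim is made about the units of the extracted ITT address.
 */
static inline void vgic_its_dte_example(u64 dte, bool *valid, u64 *next,
                                        u64 *ittaddr, u64 *size)
{
        *valid   = dte & KVM_ITS_DTE_VALID_MASK;
        *next    = (dte & KVM_ITS_DTE_NEXT_MASK) >> KVM_ITS_DTE_NEXT_SHIFT;
        *ittaddr = (dte & KVM_ITS_DTE_ITTADDR_MASK) >> KVM_ITS_DTE_ITTADDR_SHIFT;
        *size    = dte & KVM_ITS_DTE_SIZE_MASK;
}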
-
-#define KVM_VGIC_V3_RDIST_INDEX_MASK   GENMASK_ULL(11, 0)
-#define KVM_VGIC_V3_RDIST_FLAGS_MASK   GENMASK_ULL(15, 12)
-#define KVM_VGIC_V3_RDIST_FLAGS_SHIFT  12
-#define KVM_VGIC_V3_RDIST_BASE_MASK    GENMASK_ULL(51, 16)
-#define KVM_VGIC_V3_RDIST_COUNT_MASK   GENMASK_ULL(63, 52)
-#define KVM_VGIC_V3_RDIST_COUNT_SHIFT  52
-
-#ifdef CONFIG_DEBUG_SPINLOCK
-#define DEBUG_SPINLOCK_BUG_ON(p) BUG_ON(p)
-#else
-#define DEBUG_SPINLOCK_BUG_ON(p)
-#endif
-
-/* Requires the irq_lock to be held by the caller. */
-static inline bool irq_is_pending(struct vgic_irq *irq)
-{
-       if (irq->config == VGIC_CONFIG_EDGE)
-               return irq->pending_latch;
-       else
-               return irq->pending_latch || irq->line_level;
-}
-
-static inline bool vgic_irq_is_mapped_level(struct vgic_irq *irq)
-{
-       return irq->config == VGIC_CONFIG_LEVEL && irq->hw;
-}
-
-static inline int vgic_irq_get_lr_count(struct vgic_irq *irq)
-{
-       /* Account for the active state as an interrupt */
-       if (vgic_irq_is_sgi(irq->intid) && irq->source)
-               return hweight8(irq->source) + irq->active;
-
-       return irq_is_pending(irq) || irq->active;
-}
-
-static inline bool vgic_irq_is_multi_sgi(struct vgic_irq *irq)
-{
-       return vgic_irq_get_lr_count(irq) > 1;
-}
-
-/*
- * This struct provides an intermediate representation of the fields contained
- * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC
- * state to userspace can generate either GICv2 or GICv3 CPU interface
- * registers regardless of the hardware-backed GIC in use.
- */
-struct vgic_vmcr {
-       u32     grpen0;
-       u32     grpen1;
-
-       u32     ackctl;
-       u32     fiqen;
-       u32     cbpr;
-       u32     eoim;
-
-       u32     abpr;
-       u32     bpr;
-       u32     pmr;  /* Priority mask field in the GICC_PMR and
-                      * ICC_PMR_EL1 priority field format */
-};
-
-struct vgic_reg_attr {
-       struct kvm_vcpu *vcpu;
-       gpa_t addr;
-};
-
-int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
-                      struct vgic_reg_attr *reg_attr);
-int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
-                      struct vgic_reg_attr *reg_attr);
-const struct vgic_register_region *
-vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
-                    gpa_t addr, int len);
-struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
-                             u32 intid);
-void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq);
-void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq);
-bool vgic_get_phys_line_level(struct vgic_irq *irq);
-void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending);
-void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active);
-bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
-                          unsigned long flags);
-void vgic_kick_vcpus(struct kvm *kvm);
-
-int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
-                     phys_addr_t addr, phys_addr_t alignment);
-
-void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu);
-void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
-void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr);
-void vgic_v2_set_underflow(struct kvm_vcpu *vcpu);
-void vgic_v2_set_npie(struct kvm_vcpu *vcpu);
-int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
-int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-                        int offset, u32 *val);
-int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-                         int offset, u32 *val);
-void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-void vgic_v2_enable(struct kvm_vcpu *vcpu);
-int vgic_v2_probe(const struct gic_kvm_info *info);
-int vgic_v2_map_resources(struct kvm *kvm);
-int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
-                            enum vgic_type);
-
-void vgic_v2_init_lrs(void);
-void vgic_v2_load(struct kvm_vcpu *vcpu);
-void vgic_v2_put(struct kvm_vcpu *vcpu);
-void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu);
-
-void vgic_v2_save_state(struct kvm_vcpu *vcpu);
-void vgic_v2_restore_state(struct kvm_vcpu *vcpu);
-
-static inline void vgic_get_irq_kref(struct vgic_irq *irq)
-{
-       if (irq->intid < VGIC_MIN_LPI)
-               return;
-
-       kref_get(&irq->refcount);
-}
-
-void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
-void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
-void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr);
-void vgic_v3_set_underflow(struct kvm_vcpu *vcpu);
-void vgic_v3_set_npie(struct kvm_vcpu *vcpu);
-void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-void vgic_v3_enable(struct kvm_vcpu *vcpu);
-int vgic_v3_probe(const struct gic_kvm_info *info);
-int vgic_v3_map_resources(struct kvm *kvm);
-int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq);
-int vgic_v3_save_pending_tables(struct kvm *kvm);
-int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count);
-int vgic_register_redist_iodev(struct kvm_vcpu *vcpu);
-bool vgic_v3_check_base(struct kvm *kvm);
-
-void vgic_v3_load(struct kvm_vcpu *vcpu);
-void vgic_v3_put(struct kvm_vcpu *vcpu);
-void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu);
-
-bool vgic_has_its(struct kvm *kvm);
-int kvm_vgic_register_its_device(void);
-void vgic_enable_lpis(struct kvm_vcpu *vcpu);
-void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu);
-int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi);
-int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
-int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-                        int offset, u32 *val);
-int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-                        int offset, u32 *val);
-int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-                        u64 id, u64 *val);
-int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id,
-                               u64 *reg);
-int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-                                   u32 intid, u64 *val);
-int kvm_register_vgic_device(unsigned long type);
-void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-int vgic_lazy_init(struct kvm *kvm);
-int vgic_init(struct kvm *kvm);
-
-void vgic_debug_init(struct kvm *kvm);
-void vgic_debug_destroy(struct kvm *kvm);
-
-bool lock_all_vcpus(struct kvm *kvm);
-void unlock_all_vcpus(struct kvm *kvm);
-
-static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu)
-{
-       struct vgic_cpu *cpu_if = &vcpu->arch.vgic_cpu;
-
-       /*
-        * num_pri_bits is initialized with the HW-supported value.
-        * We can safely rely on num_pri_bits even if the VM has not
-        * restored ICC_CTLR_EL1 before restoring the APnR registers.
-        */
-       switch (cpu_if->num_pri_bits) {
-       case 7: return 3;
-       case 6: return 1;
-       default: return 0;
-       }
-}
-
-static inline bool
-vgic_v3_redist_region_full(struct vgic_redist_region *region)
-{
-       if (!region->count)
-               return false;
-
-       return (region->free_index >= region->count);
-}
-
-struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rdregs);
-
-static inline size_t
-vgic_v3_rd_region_size(struct kvm *kvm, struct vgic_redist_region *rdreg)
-{
-       if (!rdreg->count)
-               return atomic_read(&kvm->online_vcpus) * KVM_VGIC_V3_REDIST_SIZE;
-       else
-               return rdreg->count * KVM_VGIC_V3_REDIST_SIZE;
-}
-
-struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm,
-                                                          u32 index);
-
-bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size);
-
-static inline bool vgic_dist_overlap(struct kvm *kvm, gpa_t base, size_t size)
-{
-       struct vgic_dist *d = &kvm->arch.vgic;
-
-       return (base + size > d->vgic_dist_base) &&
-               (base < d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE);
-}
-
-int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr);
-int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
-                        u32 devid, u32 eventid, struct vgic_irq **irq);
-struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi);
-int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi);
-void vgic_lpi_translation_cache_init(struct kvm *kvm);
-void vgic_lpi_translation_cache_destroy(struct kvm *kvm);
-void vgic_its_invalidate_cache(struct kvm *kvm);
-
-bool vgic_supports_direct_msis(struct kvm *kvm);
-int vgic_v4_init(struct kvm *kvm);
-void vgic_v4_teardown(struct kvm *kvm);
-void vgic_v4_configure_vsgis(struct kvm *kvm);
-
-#endif