From: Nicholas Piggin Date: Fri, 28 May 2021 09:07:30 +0000 (+1000) Subject: KVM: PPC: Book3S HV P9: Reduce irq_work vs guest decrementer races X-Git-Tag: accepted/tizen/unified/20230118.172025~6813^2~116^2~30 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=6ffe2c6e6dcefb971e4046f02086c4adadd0b310;p=platform%2Fkernel%2Flinux-rpi.git KVM: PPC: Book3S HV P9: Reduce irq_work vs guest decrementer races irq_work's use of the DEC SPR is racy with guest<->host switch and guest entry which flips the DEC interrupt to guest, which could lose a host work interrupt. This patch closes one race, and attempts to comment another class of races. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210528090752.3542186-11-npiggin@gmail.com --- diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 8dd3cdb..8c2c3dd 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -97,6 +97,18 @@ extern void div128_by_32(u64 dividend_high, u64 dividend_low, extern void secondary_cpu_time_init(void); extern void __init time_init(void); +#ifdef CONFIG_PPC64 +static inline unsigned long test_irq_work_pending(void) +{ + unsigned long x; + + asm volatile("lbz %0,%1(13)" + : "=r" (x) + : "i" (offsetof(struct paca_struct, irq_work_pending))); + return x; +} +#endif + DECLARE_PER_CPU(u64, decrementers_next_tb); /* Convert timebase ticks to nanoseconds */ diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index b67d93a..da995c5 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -508,16 +508,6 @@ EXPORT_SYMBOL(profile_pc); * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable... */ #ifdef CONFIG_PPC64 -static inline unsigned long test_irq_work_pending(void) -{ - unsigned long x; - - asm volatile("lbz %0,%1(13)" - : "=r" (x) - : "i" (offsetof(struct paca_struct, irq_work_pending))); - return x; -} - static inline void set_irq_work_pending_flag(void) { asm volatile("stb %0,%1(13)" : : diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 466d62b..d82ff7f 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3708,6 +3708,18 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, if (!(vcpu->arch.ctrl & 1)) mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1); + /* + * When setting DEC, we must always deal with irq_work_raise via NMI vs + * setting DEC. The problem occurs right as we switch into guest mode + * if a NMI hits and sets pending work and sets DEC, then that will + * apply to the guest and not bring us back to the host. + * + * irq_work_raise could check a flag (or possibly LPCR[HDICE] for + * example) and set HDEC to 1? That wouldn't solve the nested hv + * case which needs to abort the hcall or zero the time limit. + * + * XXX: Another day's problem. + */ mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb()); if (kvmhv_on_pseries()) { @@ -3822,6 +3834,9 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, vc->in_guest = 0; mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb()); + /* We may have raced with new irq work */ + if (test_irq_work_pending()) + set_dec(1); mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso); kvmhv_load_host_pmu();