From c68dc1b577eabd5605c6c7c08f3e07ae18d30d5d Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 16 Sep 2021 18:15:35 +0000 Subject: [PATCH] KVM: x86: Report host tsc and realtime values in KVM_GET_CLOCK Handling the migration of TSCs correctly is difficult, in part because Linux does not provide userspace with the ability to retrieve a (TSC, realtime) clock pair for a single instant in time. In lieu of a more convenient facility, KVM can report similar information in the kvm_clock structure. Provide userspace with a host TSC & realtime pair iff the realtime clock is based on the TSC. If userspace provides KVM_SET_CLOCK with a valid realtime value, advance the KVM clock by the amount of elapsed time. Do not step the KVM clock backwards, though, as it is a monotonic oscillator. Suggested-by: Paolo Bonzini Signed-off-by: Oliver Upton Signed-off-by: Paolo Bonzini Message-Id: <20210916181538.968978-5-oupton@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 48 +++++++++++++++++++++++++++++++++-------- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/x86.c | 47 ++++++++++++++++++++++++++++------------ include/uapi/linux/kvm.h | 7 +++++- 4 files changed, 81 insertions(+), 24 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 0c0bf26..3b093d6d 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -1010,20 +1010,37 @@ such as migration. When KVM_CAP_ADJUST_CLOCK is passed to KVM_CHECK_EXTENSION, it returns the set of bits that KVM can return in struct kvm_clock_data's flag member. -The only flag defined now is KVM_CLOCK_TSC_STABLE. If set, the returned -value is the exact kvmclock value seen by all VCPUs at the instant -when KVM_GET_CLOCK was called. If clear, the returned value is simply -CLOCK_MONOTONIC plus a constant offset; the offset can be modified -with KVM_SET_CLOCK. KVM will try to make all VCPUs follow this clock, -but the exact value read by each VCPU could differ, because the host -TSC is not stable. +The following flags are defined: + +KVM_CLOCK_TSC_STABLE + If set, the returned value is the exact kvmclock + value seen by all VCPUs at the instant when KVM_GET_CLOCK was called. + If clear, the returned value is simply CLOCK_MONOTONIC plus a constant + offset; the offset can be modified with KVM_SET_CLOCK. KVM will try + to make all VCPUs follow this clock, but the exact value read by each + VCPU could differ, because the host TSC is not stable. + +KVM_CLOCK_REALTIME + If set, the `realtime` field in the kvm_clock_data + structure is populated with the value of the host's real time + clocksource at the instant when KVM_GET_CLOCK was called. If clear, + the `realtime` field does not contain a value. + +KVM_CLOCK_HOST_TSC + If set, the `host_tsc` field in the kvm_clock_data + structure is populated with the value of the host's timestamp counter (TSC) + at the instant when KVM_GET_CLOCK was called. If clear, the `host_tsc` field + does not contain a value. :: struct kvm_clock_data { __u64 clock; /* kvmclock current value */ __u32 flags; - __u32 pad[9]; + __u32 pad0; + __u64 realtime; + __u64 host_tsc; + __u32 pad[4]; }; @@ -1040,12 +1057,25 @@ Sets the current timestamp of kvmclock to the value specified in its parameter. In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity on scenarios such as migration. +The following flags can be passed: + +KVM_CLOCK_REALTIME + If set, KVM will compare the value of the `realtime` field + with the value of the host's real time clocksource at the instant when + KVM_SET_CLOCK was called. The difference in elapsed time is added to the final + kvmclock value that will be provided to guests. + +Other flags returned by ``KVM_GET_CLOCK`` are accepted but ignored. + :: struct kvm_clock_data { __u64 clock; /* kvmclock current value */ __u32 flags; - __u32 pad[9]; + __u32 pad0; + __u64 realtime; + __u64 host_tsc; + __u32 pad[4]; }; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5271fce..8b16fa5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1942,4 +1942,7 @@ int kvm_cpu_dirty_log_size(void); int alloc_all_memslots_rmaps(struct kvm *kvm); +#define KVM_CLOCK_VALID_FLAGS \ + (KVM_CLOCK_TSC_STABLE | KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC) + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3ea4f6e..d3631d1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2787,6 +2787,7 @@ static void get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) struct pvclock_vcpu_time_info hv_clock; unsigned long flags; + data->flags = 0; spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); if (!ka->use_master_clock) { spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); @@ -2803,10 +2804,20 @@ static void get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) get_cpu(); if (__this_cpu_read(cpu_tsc_khz)) { +#ifdef CONFIG_X86_64 + struct timespec64 ts; + + if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) { + data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec; + data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC; + } else +#endif + data->host_tsc = rdtsc(); + kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, &hv_clock.tsc_shift, &hv_clock.tsc_to_system_mul); - data->clock = __pvclock_read_cycles(&hv_clock, rdtsc()); + data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc); } else { data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset; } @@ -2818,12 +2829,6 @@ u64 get_kvmclock_ns(struct kvm *kvm) { struct kvm_clock_data data; - /* - * Zero flags as it's accessed RMW, leave everything else uninitialized - * as clock is always written and no other fields are consumed. - */ - data.flags = 0; - get_kvmclock(kvm, &data); return data.clock; } @@ -4050,7 +4055,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_SYNC_X86_VALID_FIELDS; break; case KVM_CAP_ADJUST_CLOCK: - r = KVM_CLOCK_TSC_STABLE; + r = KVM_CLOCK_VALID_FLAGS; break; case KVM_CAP_X86_DISABLE_EXITS: r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE | @@ -5847,12 +5852,16 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp) { struct kvm_arch *ka = &kvm->arch; struct kvm_clock_data data; - u64 now_ns; + u64 now_raw_ns; if (copy_from_user(&data, argp, sizeof(data))) return -EFAULT; - if (data.flags) + /* + * Only KVM_CLOCK_REALTIME is used, but allow passing the + * result of KVM_GET_CLOCK back to KVM_SET_CLOCK. + */ + if (data.flags & ~KVM_CLOCK_VALID_FLAGS) return -EINVAL; kvm_hv_invalidate_tsc_page(kvm); @@ -5866,11 +5875,21 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp) * is slightly ahead) here we risk going negative on unsigned * 'system_time' when 'data.clock' is very small. */ - if (kvm->arch.use_master_clock) - now_ns = ka->master_kernel_ns; + if (data.flags & KVM_CLOCK_REALTIME) { + u64 now_real_ns = ktime_get_real_ns(); + + /* + * Avoid stepping the kvmclock backwards. + */ + if (now_real_ns > data.realtime) + data.clock += now_real_ns - data.realtime; + } + + if (ka->use_master_clock) + now_raw_ns = ka->master_kernel_ns; else - now_ns = get_kvmclock_base_ns(); - ka->kvmclock_offset = data.clock - now_ns; + now_raw_ns = get_kvmclock_base_ns(); + ka->kvmclock_offset = data.clock - now_raw_ns; kvm_end_pvclock_update(kvm); return 0; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 322b4b5..5ca5ffe 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1231,11 +1231,16 @@ struct kvm_irqfd { /* Do not use 1, KVM_CHECK_EXTENSION returned it before we had flags. */ #define KVM_CLOCK_TSC_STABLE 2 +#define KVM_CLOCK_REALTIME (1 << 2) +#define KVM_CLOCK_HOST_TSC (1 << 3) struct kvm_clock_data { __u64 clock; __u32 flags; - __u32 pad[9]; + __u32 pad0; + __u64 realtime; + __u64 host_tsc; + __u32 pad[4]; }; /* For KVM_CAP_SW_TLB */ -- 2.7.4