kvm: nVMX: Introduce KVM_CAP_NESTED_STATE
authorJim Mattson <jmattson@google.com>
Tue, 10 Jul 2018 09:27:20 +0000 (11:27 +0200)
committerPaolo Bonzini <pbonzini@redhat.com>
Mon, 6 Aug 2018 15:58:30 +0000 (17:58 +0200)
For nested virtualization L0 KVM is managing a bit of state for L2 guests,
this state can not be captured through the currently available IOCTLs. In
fact the state captured through all of these IOCTLs is usually a mix of L1
and L2 state. It is also dependent on whether the L2 guest was running at
the moment when the process was interrupted to save its state.

With this capability, there are two new vcpu ioctls: KVM_GET_NESTED_STATE
and KVM_SET_NESTED_STATE. These can be used for saving and restoring a VM
that is in VMX operation.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Jim Mattson <jmattson@google.com>
[karahmed@ - rename structs and functions and make them ready for AMD and
             address previous comments.
           - handle nested.smm state.
           - rebase & a bit of refactoring.
           - Merge 7/8 and 8/8 into one patch. ]
Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Documentation/virtual/kvm/api.txt
arch/x86/include/asm/kvm_host.h
arch/x86/include/uapi/asm/kvm.h
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
include/uapi/linux/kvm.h

index cb8db4f9d09794d6bc5cef908406fcc3ee76e213..7b83b176c662ce24a014299a69be27e28452a14d 100644 (file)
@@ -3561,6 +3561,62 @@ Returns: 0 on success,
        -ENOENT on deassign if the conn_id isn't registered
        -EEXIST on assign if the conn_id is already registered
 
+4.114 KVM_GET_NESTED_STATE
+
+Capability: KVM_CAP_NESTED_STATE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_nested_state (in/out)
+Returns: 0 on success, -1 on error
+Errors:
+  E2BIG:     the total state size (including the fixed-size part of struct
+             kvm_nested_state) exceeds the value of 'size' specified by
+             the user; the size required will be written into size.
+
+struct kvm_nested_state {
+       __u16 flags;
+       __u16 format;
+       __u32 size;
+       union {
+               struct kvm_vmx_nested_state vmx;
+               struct kvm_svm_nested_state svm;
+               __u8 pad[120];
+       };
+       __u8 data[0];
+};
+
+#define KVM_STATE_NESTED_GUEST_MODE    0x00000001
+#define KVM_STATE_NESTED_RUN_PENDING   0x00000002
+
+#define KVM_STATE_NESTED_SMM_GUEST_MODE        0x00000001
+#define KVM_STATE_NESTED_SMM_VMXON     0x00000002
+
+struct kvm_vmx_nested_state {
+       __u64 vmxon_pa;
+       __u64 vmcs_pa;
+
+       struct {
+               __u16 flags;
+       } smm;
+};
+
+This ioctl copies the vcpu's nested virtualization state from the kernel to
+userspace.
+
+The maximum size of the state, including the fixed-size part of struct
+kvm_nested_state, can be retrieved by passing KVM_CAP_NESTED_STATE to
+the KVM_CHECK_EXTENSION ioctl().
+
+4.115 KVM_SET_NESTED_STATE
+
+Capability: KVM_CAP_NESTED_STATE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_nested_state (in)
+Returns: 0 on success, -1 on error
+
+This copies the vcpu's kvm_nested_state struct from userspace to the kernel.  For
+the definition of struct kvm_nested_state, see KVM_GET_NESTED_STATE.
 
 5. The kvm_run structure
 ------------------------
index da957725992de02b7ddbd32b4508473fd3e70d2e..bd287b348751a498e304d9448d7ba6bbdb049822 100644 (file)
@@ -1086,6 +1086,12 @@ struct kvm_x86_ops {
 
        void (*setup_mce)(struct kvm_vcpu *vcpu);
 
+       int (*get_nested_state)(struct kvm_vcpu *vcpu,
+                               struct kvm_nested_state __user *user_kvm_nested_state,
+                               unsigned user_data_size);
+       int (*set_nested_state)(struct kvm_vcpu *vcpu,
+                               struct kvm_nested_state __user *user_kvm_nested_state,
+                               struct kvm_nested_state *kvm_state);
        void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
 
        int (*smi_allowed)(struct kvm_vcpu *vcpu);
index c535c2fdea136a5e58725e72a2d867f545d393eb..86299efa804adbfc35d3338f9fe75083e6cdf5af 100644 (file)
@@ -378,4 +378,41 @@ struct kvm_sync_regs {
 #define KVM_X86_QUIRK_LINT0_REENABLED  (1 << 0)
 #define KVM_X86_QUIRK_CD_NW_CLEARED    (1 << 1)
 
+#define KVM_STATE_NESTED_GUEST_MODE    0x00000001
+#define KVM_STATE_NESTED_RUN_PENDING   0x00000002
+
+#define KVM_STATE_NESTED_SMM_GUEST_MODE        0x00000001
+#define KVM_STATE_NESTED_SMM_VMXON     0x00000002
+
+struct kvm_vmx_nested_state {
+       __u64 vmxon_pa;
+       __u64 vmcs_pa;
+
+       struct {
+               __u16 flags;
+       } smm;
+};
+
+/* for KVM_CAP_NESTED_STATE */
+struct kvm_nested_state {
+       /* KVM_STATE_* flags */
+       __u16 flags;
+
+       /* 0 for VMX, 1 for SVM.  */
+       __u16 format;
+
+       /* 128 for SVM, 128 + VMCS size for VMX.  */
+       __u32 size;
+
+       union {
+               /* VMXON, VMCS */
+               struct kvm_vmx_nested_state vmx;
+
+               /* Pad the header to 128 bytes.  */
+               __u8 pad[120];
+       };
+
+       __u8 data[0];
+};
+
 #endif /* _ASM_X86_KVM_H */
index fee44e4c5c7907fc6ba1c4a4883eaae6a71fb377..4be6486173b7d4195e1681e91e14c4bfdf305c85 100644 (file)
@@ -7589,6 +7589,11 @@ static __init int hardware_setup(void)
        else
                kvm_disable_tdp();
 
+       if (!nested) {
+               kvm_x86_ops->get_nested_state = NULL;
+               kvm_x86_ops->set_nested_state = NULL;
+       }
+
        /*
         * Only enable PML when hardware supports PML feature, and both EPT
         * and EPT A/D bit features are enabled -- PML depends on them to work.
@@ -11775,8 +11780,8 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 }
 
 /*
- * If exit_qual is NULL, this is being called from RSM.
- * Otherwise it's called from vmlaunch/vmresume.
+ * If exit_qual is NULL, this is being called from state restore (either RSM
+ * or KVM_SET_NESTED_STATE).  Otherwise it's called from vmlaunch/vmresume.
  */
 static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
 {
@@ -13016,6 +13021,170 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
+                               struct kvm_nested_state __user *user_kvm_nested_state,
+                               u32 user_data_size)
+{
+       struct vcpu_vmx *vmx;
+       struct vmcs12 *vmcs12;
+       struct kvm_nested_state kvm_state = {
+               .flags = 0,
+               .format = 0,
+               .size = sizeof(kvm_state),
+               .vmx.vmxon_pa = -1ull,
+               .vmx.vmcs_pa = -1ull,
+       };
+
+       if (!vcpu)
+               return kvm_state.size + 2 * VMCS12_SIZE;
+
+       vmx = to_vmx(vcpu);
+       vmcs12 = get_vmcs12(vcpu);
+       if (nested_vmx_allowed(vcpu) &&
+           (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
+               kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
+               kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr;
+
+               if (vmx->nested.current_vmptr != -1ull)
+                       kvm_state.size += VMCS12_SIZE;
+
+               if (vmx->nested.smm.vmxon)
+                       kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
+
+               if (vmx->nested.smm.guest_mode)
+                       kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
+
+               if (is_guest_mode(vcpu)) {
+                       kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
+
+                       if (vmx->nested.nested_run_pending)
+                               kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
+               }
+       }
+
+       if (user_data_size < kvm_state.size)
+               goto out;
+
+       if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
+               return -EFAULT;
+
+       if (vmx->nested.current_vmptr == -1ull)
+               goto out;
+
+       /*
+        * When running L2, the authoritative vmcs12 state is in the
+        * vmcs02. When running L1, the authoritative vmcs12 state is
+        * in the shadow vmcs linked to vmcs01, unless
+        * sync_shadow_vmcs is set, in which case, the authoritative
+        * vmcs12 state is in the vmcs12 already.
+        */
+       if (is_guest_mode(vcpu))
+               sync_vmcs12(vcpu, vmcs12);
+       else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs)
+               copy_shadow_to_vmcs12(vmx);
+
+       if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12)))
+               return -EFAULT;
+
+out:
+       return kvm_state.size;
+}
+
+static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
+                               struct kvm_nested_state __user *user_kvm_nested_state,
+                               struct kvm_nested_state *kvm_state)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       struct vmcs12 *vmcs12;
+       u32 exit_qual;
+       int ret;
+
+       if (kvm_state->format != 0)
+               return -EINVAL;
+
+       if (!nested_vmx_allowed(vcpu))
+               return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL;
+
+       if (kvm_state->vmx.vmxon_pa == -1ull) {
+               if (kvm_state->vmx.smm.flags)
+                       return -EINVAL;
+
+               if (kvm_state->vmx.vmcs_pa != -1ull)
+                       return -EINVAL;
+
+               vmx_leave_nested(vcpu);
+               return 0;
+       }
+
+       if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa))
+               return -EINVAL;
+
+       if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12))
+               return -EINVAL;
+
+       if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa ||
+           !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa))
+               return -EINVAL;
+
+       if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
+           (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
+               return -EINVAL;
+
+       if (kvm_state->vmx.smm.flags &
+           ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
+               return -EINVAL;
+
+       if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
+           !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
+               return -EINVAL;
+
+       vmx_leave_nested(vcpu);
+       if (kvm_state->vmx.vmxon_pa == -1ull)
+               return 0;
+
+       vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa;
+       ret = enter_vmx_operation(vcpu);
+       if (ret)
+               return ret;
+
+       set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa);
+
+       if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
+               vmx->nested.smm.vmxon = true;
+               vmx->nested.vmxon = false;
+
+               if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
+                       vmx->nested.smm.guest_mode = true;
+       }
+
+       vmcs12 = get_vmcs12(vcpu);
+       if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12)))
+               return -EFAULT;
+
+       if (vmcs12->revision_id != VMCS12_REVISION)
+               return -EINVAL;
+
+       if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
+               return 0;
+
+       vmx->nested.nested_run_pending =
+               !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
+
+       if (check_vmentry_prereqs(vcpu, vmcs12) ||
+           check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
+               return -EINVAL;
+
+       if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING)
+               vmx->nested.nested_run_pending = 1;
+
+       vmx->nested.dirty_vmcs12 = true;
+       ret = enter_vmx_non_root_mode(vcpu, NULL);
+       if (ret)
+               return -EINVAL;
+
+       return 0;
+}
+
 static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .cpu_has_kvm_support = cpu_has_kvm_support,
        .disabled_by_bios = vmx_disabled_by_bios,
@@ -13150,6 +13319,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 
        .setup_mce = vmx_setup_mce,
 
+       .get_nested_state = vmx_get_nested_state,
+       .set_nested_state = vmx_set_nested_state,
        .get_vmcs12_pages = nested_get_vmcs12_pages,
 
        .smi_allowed = vmx_smi_allowed,
index fbd59ad047b0dc5d1bef81d79ef253837e584af0..1b14c4a654c32baf770f13307e698407bb0c2989 100644 (file)
@@ -2947,6 +2947,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_X2APIC_API:
                r = KVM_X2APIC_API_VALID_FLAGS;
                break;
+       case KVM_CAP_NESTED_STATE:
+               r = kvm_x86_ops->get_nested_state ?
+                       kvm_x86_ops->get_nested_state(NULL, 0, 0) : 0;
+               break;
        default:
                break;
        }
@@ -3963,6 +3967,56 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
                break;
        }
+       case KVM_GET_NESTED_STATE: {
+               struct kvm_nested_state __user *user_kvm_nested_state = argp;
+               u32 user_data_size;
+
+               r = -EINVAL;
+               if (!kvm_x86_ops->get_nested_state)
+                       break;
+
+               BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
+               if (get_user(user_data_size, &user_kvm_nested_state->size))
+                       return -EFAULT;
+
+               r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state,
+                                                 user_data_size);
+               if (r < 0)
+                       return r;
+
+               if (r > user_data_size) {
+                       if (put_user(r, &user_kvm_nested_state->size))
+                               return -EFAULT;
+                       return -E2BIG;
+               }
+               r = 0;
+               break;
+       }
+       case KVM_SET_NESTED_STATE: {
+               struct kvm_nested_state __user *user_kvm_nested_state = argp;
+               struct kvm_nested_state kvm_state;
+
+               r = -EINVAL;
+               if (!kvm_x86_ops->set_nested_state)
+                       break;
+
+               if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state)))
+                       return -EFAULT;
+
+               if (kvm_state.size < sizeof(kvm_state))
+                       return -EINVAL;
+
+               if (kvm_state.flags &
+                   ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE))
+                       return -EINVAL;
+
+               /* nested_run_pending implies guest_mode.  */
+               if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING)
+                       return -EINVAL;
+
+               r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
+               break;
+       }
        default:
                r = -EINVAL;
        }
index b955b986b3413ac2fd2a7262b6bbee29006a65aa..3cf632839337b7cf0288971e2612376d52de636d 100644 (file)
@@ -950,6 +950,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_HYPERV_EVENTFD 154
 #define KVM_CAP_HYPERV_TLBFLUSH 155
 #define KVM_CAP_S390_HPAGE_1M 156
+#define KVM_CAP_NESTED_STATE 157
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1392,6 +1393,9 @@ struct kvm_enc_region {
 /* Available with KVM_CAP_HYPERV_EVENTFD */
 #define KVM_HYPERV_EVENTFD        _IOW(KVMIO,  0xbd, struct kvm_hyperv_eventfd)
 
+/* Available with KVM_CAP_NESTED_STATE */
+#define KVM_GET_NESTED_STATE         _IOWR(KVMIO, 0xbe, struct kvm_nested_state)
+#define KVM_SET_NESTED_STATE         _IOW(KVMIO,  0xbf, struct kvm_nested_state)
 
 /* Secure Encrypted Virtualization command */
 enum sev_cmd_id {