Merge tag 'hyperv-next-signed-20210426' of git://git.kernel.org/pub/scm/linux/kernel...
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 26 Apr 2021 17:44:16 +0000 (10:44 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 26 Apr 2021 17:44:16 +0000 (10:44 -0700)
Pull Hyper-V updates from Wei Liu:

 - VMBus enhancements

 - Free page reporting support for Hyper-V balloon driver

 - Some patches for running Linux as an Arm64 Hyper-V guest

 - A few misc clean-up patches

* tag 'hyperv-next-signed-20210426' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux: (30 commits)
  drivers: hv: Create a consistent pattern for checking Hyper-V hypercall status
  x86/hyperv: Move hv_do_rep_hypercall to asm-generic
  video: hyperv_fb: Add ratelimit on error message
  Drivers: hv: vmbus: Increase wait time for VMbus unload
  Drivers: hv: vmbus: Initialize unload_event statically
  Drivers: hv: vmbus: Check for pending channel interrupts before taking a CPU offline
  Drivers: hv: vmbus: Introduce CHANNELMSG_MODIFYCHANNEL_RESPONSE
  Drivers: hv: vmbus: Introduce and negotiate VMBus protocol version 5.3
  Drivers: hv: vmbus: Use after free in __vmbus_open()
  Drivers: hv: vmbus: remove unused function
  Drivers: hv: vmbus: Remove unused linux/version.h header
  x86/hyperv: remove unused linux/version.h header
  x86/Hyper-V: Support for free page reporting
  x86/hyperv: Fix unused variable 'hi' warning in hv_apic_read
  x86/hyperv: Fix unused variable 'msr_val' warning in hv_qlock_wait
  hv: hyperv.h: a few mundane typo fixes
  drivers: hv: Fix EXPORT_SYMBOL and tab spaces issue
  Drivers: hv: vmbus: Drop error message when 'No request id available'
  asm-generic/hyperv: Add missing function prototypes per -W1 warnings
  clocksource/drivers/hyper-v: Move handling of STIMER0 interrupts
  ...
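
The "consistent pattern" commit at the top of the list replaces open-coded
checks like "(status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS" with two
small helpers in include/asm-generic/mshyperv.h, which the hunks below use
throughout. Their shape is roughly:

	/* Low 16 bits of the hypercall status word carry the result code */
	static inline int hv_result(u64 status)
	{
		return status & HV_HYPERCALL_RESULT_MASK;
	}

	static inline bool hv_result_success(u64 status)
	{
		return hv_result(status) == HV_STATUS_SUCCESS;
	}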

26 files changed:
arch/x86/hyperv/hv_apic.c
arch/x86/hyperv/hv_init.c
arch/x86/hyperv/hv_proc.c
arch/x86/hyperv/hv_spinlock.c
arch/x86/hyperv/irqdomain.c
arch/x86/hyperv/mmu.c
arch/x86/hyperv/nested.c
arch/x86/include/asm/hyperv-tlfs.h
arch/x86/include/asm/mshyperv.h
arch/x86/kernel/cpu/mshyperv.c
drivers/clocksource/hyperv_timer.c
drivers/hv/Kconfig
drivers/hv/channel.c
drivers/hv/channel_mgmt.c
drivers/hv/connection.c
drivers/hv/hv.c
drivers/hv/hv_balloon.c
drivers/hv/hv_trace.h
drivers/hv/ring_buffer.c
drivers/hv/vmbus_drv.c
drivers/pci/controller/pci-hyperv.c
drivers/video/fbdev/hyperv_fb.c
include/asm-generic/hyperv-tlfs.h
include/asm-generic/mshyperv.h
include/clocksource/hyperv_timer.h
include/linux/hyperv.h

index 284e736..90e682a 100644 (file)
@@ -60,9 +60,11 @@ static u32 hv_apic_read(u32 reg)
        switch (reg) {
        case APIC_EOI:
                rdmsr(HV_X64_MSR_EOI, reg_val, hi);
+               (void)hi;
                return reg_val;
        case APIC_TASKPRI:
                rdmsr(HV_X64_MSR_TPR, reg_val, hi);
+               (void)hi;
                return reg_val;
 
        default:
@@ -103,7 +105,7 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector)
        struct hv_send_ipi_ex *ipi_arg;
        unsigned long flags;
        int nr_bank = 0;
-       int ret = 1;
+       u64 status = HV_STATUS_INVALID_PARAMETER;
 
        if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
                return false;
@@ -128,19 +130,19 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector)
        if (!nr_bank)
                ipi_arg->vp_set.format = HV_GENERIC_SET_ALL;
 
-       ret = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank,
+       status = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank,
                              ipi_arg, NULL);
 
 ipi_mask_ex_done:
        local_irq_restore(flags);
-       return ((ret == 0) ? true : false);
+       return hv_result_success(status);
 }
 
 static bool __send_ipi_mask(const struct cpumask *mask, int vector)
 {
        int cur_cpu, vcpu;
        struct hv_send_ipi ipi_arg;
-       int ret = 1;
+       u64 status;
 
        trace_hyperv_send_ipi_mask(mask, vector);
 
@@ -184,9 +186,9 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector)
                __set_bit(vcpu, (unsigned long *)&ipi_arg.cpu_mask);
        }
 
-       ret = hv_do_fast_hypercall16(HVCALL_SEND_IPI, ipi_arg.vector,
+       status = hv_do_fast_hypercall16(HVCALL_SEND_IPI, ipi_arg.vector,
                                     ipi_arg.cpu_mask);
-       return ((ret == 0) ? true : false);
+       return hv_result_success(status);
 
 do_ex_hypercall:
        return __send_ipi_mask_ex(mask, vector);
@@ -195,6 +197,7 @@ do_ex_hypercall:
 static bool __send_ipi_one(int cpu, int vector)
 {
        int vp = hv_cpu_number_to_vp_number(cpu);
+       u64 status;
 
        trace_hyperv_send_ipi_one(cpu, vector);
 
@@ -207,7 +210,8 @@ static bool __send_ipi_one(int cpu, int vector)
        if (vp >= 64)
                return __send_ipi_mask_ex(cpumask_of(cpu), vector);
 
-       return !hv_do_fast_hypercall16(HVCALL_SEND_IPI, vector, BIT_ULL(vp));
+       status = hv_do_fast_hypercall16(HVCALL_SEND_IPI, vector, BIT_ULL(vp));
+       return hv_result_success(status);
 }
 
 static void hv_send_ipi(int cpu, int vector)
index e7b94f6..bb0ae4b 100644 (file)
@@ -54,28 +54,6 @@ EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg);
 u32 hv_max_vp_index;
 EXPORT_SYMBOL_GPL(hv_max_vp_index);
 
-void *hv_alloc_hyperv_page(void)
-{
-       BUILD_BUG_ON(PAGE_SIZE != HV_HYP_PAGE_SIZE);
-
-       return (void *)__get_free_page(GFP_KERNEL);
-}
-EXPORT_SYMBOL_GPL(hv_alloc_hyperv_page);
-
-void *hv_alloc_hyperv_zeroed_page(void)
-{
-        BUILD_BUG_ON(PAGE_SIZE != HV_HYP_PAGE_SIZE);
-
-        return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-}
-EXPORT_SYMBOL_GPL(hv_alloc_hyperv_zeroed_page);
-
-void hv_free_hyperv_page(unsigned long addr)
-{
-       free_page(addr);
-}
-EXPORT_SYMBOL_GPL(hv_free_hyperv_page);
-
 static int hv_cpu_init(unsigned int cpu)
 {
        u64 msr_vp_index;
@@ -97,7 +75,7 @@ static int hv_cpu_init(unsigned int cpu)
                *output_arg = page_address(pg + 1);
        }
 
-       hv_get_vp_index(msr_vp_index);
+       msr_vp_index = hv_get_register(HV_REGISTER_VP_INDEX);
 
        hv_vp_index[smp_processor_id()] = msr_vp_index;
 
@@ -349,7 +327,7 @@ static void __init hv_stimer_setup_percpu_clockev(void)
         * Ignore any errors in setting up stimer clockevents
         * as we can run with the LAPIC timer as a fallback.
         */
-       (void)hv_stimer_alloc();
+       (void)hv_stimer_alloc(false);
 
        /*
         * Still register the LAPIC timer, because the direct-mode STIMER is
@@ -369,7 +347,7 @@ static void __init hv_get_partition_id(void)
        local_irq_save(flags);
        output_page = *this_cpu_ptr(hyperv_pcpu_output_arg);
        status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output_page);
-       if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) {
+       if (!hv_result_success(status)) {
                /* No point in proceeding if this failed */
                pr_err("Failed to get partition ID: %lld\n", status);
                BUG();
@@ -520,6 +498,8 @@ void __init hyperv_init(void)
                x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain;
 #endif
 
+       /* Query the VM's extended capabilities once, so that they can be cached. */
+       hv_query_ext_cap(0);
        return;
 
 remove_cpuhp_state:
@@ -593,33 +573,6 @@ void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die)
 }
 EXPORT_SYMBOL_GPL(hyperv_report_panic);
 
-/**
- * hyperv_report_panic_msg - report panic message to Hyper-V
- * @pa: physical address of the panic page containing the message
- * @size: size of the message in the page
- */
-void hyperv_report_panic_msg(phys_addr_t pa, size_t size)
-{
-       /*
-        * P3 to contain the physical address of the panic page & P4 to
-        * contain the size of the panic data in that page. Rest of the
-        * registers are no-op when the NOTIFY_MSG flag is set.
-        */
-       wrmsrl(HV_X64_MSR_CRASH_P0, 0);
-       wrmsrl(HV_X64_MSR_CRASH_P1, 0);
-       wrmsrl(HV_X64_MSR_CRASH_P2, 0);
-       wrmsrl(HV_X64_MSR_CRASH_P3, pa);
-       wrmsrl(HV_X64_MSR_CRASH_P4, size);
-
-       /*
-        * Let Hyper-V know there is crash data available along with
-        * the panic message.
-        */
-       wrmsrl(HV_X64_MSR_CRASH_CTL,
-              (HV_CRASH_CTL_CRASH_NOTIFY | HV_CRASH_CTL_CRASH_NOTIFY_MSG));
-}
-EXPORT_SYMBOL_GPL(hyperv_report_panic_msg);
-
 bool hv_is_hyperv_initialized(void)
 {
        union hv_x64_msr_hypercall_contents hypercall_msr;
@@ -650,7 +603,7 @@ EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);
 
 enum hv_isolation_type hv_get_isolation_type(void)
 {
-       if (!(ms_hyperv.features_b & HV_ISOLATION))
+       if (!(ms_hyperv.priv_high & HV_ISOLATION))
                return HV_ISOLATION_TYPE_NONE;
        return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b);
 }
@@ -661,3 +614,50 @@ bool hv_is_isolation_supported(void)
        return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE;
 }
 EXPORT_SYMBOL_GPL(hv_is_isolation_supported);
+
+/* Bit mask of the extended capability to query: see HV_EXT_CAPABILITY_xxx */
+bool hv_query_ext_cap(u64 cap_query)
+{
+       /*
+        * The address of the 'hv_extended_cap' variable will be used as an
+        * output parameter to the hypercall below and so it should be
+        * compatible with 'virt_to_phys'. Which means, it's address should be
+        * directly mapped. Use 'static' to keep it compatible; stack variables
+        * can be virtually mapped, making them imcompatible with
+        * 'virt_to_phys'.
+        * Hypercall input/output addresses should also be 8-byte aligned.
+        */
+       static u64 hv_extended_cap __aligned(8);
+       static bool hv_extended_cap_queried;
+       u64 status;
+
+       /*
+        * Querying extended capabilities is an extended hypercall. First check
+        * whether the partition supports extended hypercalls.
+        */
+       if (!(ms_hyperv.priv_high & HV_ENABLE_EXTENDED_HYPERCALLS))
+               return false;
+
+       /* Extended capabilities do not change at runtime. */
+       if (hv_extended_cap_queried)
+               return hv_extended_cap & cap_query;
+
+       status = hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL,
+                                &hv_extended_cap);
+
+       /*
+        * The query extended capabilities hypercall should not fail under
+        * any normal circumstances. Avoid repeatedly making the hypercall on
+        * error.
+        */
+       hv_extended_cap_queried = true;
+       status &= HV_HYPERCALL_RESULT_MASK;
+       if (status != HV_STATUS_SUCCESS) {
+               pr_err("Hyper-V: Extended query capabilities hypercall failed 0x%llx\n",
+                      status);
+               return false;
+       }
+
+       return hv_extended_cap & cap_query;
+}
+EXPORT_SYMBOL_GPL(hv_query_ext_cap);
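
A caller tests one capability bit against the cached mask returned by the
hypercall; a minimal usage sketch (the balloon driver's free page reporting
path gates itself on the cold discard hint bit in this way):

	/* Sketch: enable a feature only if the host advertises the capability */
	if (hv_query_ext_cap(HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT)) {
		/* host accepts cold discard hints; free page reporting is usable */
	}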
index 60461e5..68a0843 100644 (file)
@@ -1,6 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/types.h>
-#include <linux/version.h>
 #include <linux/vmalloc.h>
 #include <linux/mm.h>
 #include <linux/clockchips.h>
@@ -93,10 +92,9 @@ int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
        status = hv_do_rep_hypercall(HVCALL_DEPOSIT_MEMORY,
                                     page_count, 0, input_page, NULL);
        local_irq_restore(flags);
-
-       if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) {
+       if (!hv_result_success(status)) {
                pr_err("Failed to deposit pages: %lld\n", status);
-               ret = status;
+               ret = hv_result(status);
                goto err_free_allocations;
        }
 
@@ -122,7 +120,7 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
        struct hv_add_logical_processor_out *output;
        u64 status;
        unsigned long flags;
-       int ret = 0;
+       int ret = HV_STATUS_SUCCESS;
        int pxm = node_to_pxm(node);
 
        /*
@@ -148,13 +146,11 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
                                         input, output);
                local_irq_restore(flags);
 
-               status &= HV_HYPERCALL_RESULT_MASK;
-
-               if (status != HV_STATUS_INSUFFICIENT_MEMORY) {
-                       if (status != HV_STATUS_SUCCESS) {
+               if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
+                       if (!hv_result_success(status)) {
                                pr_err("%s: cpu %u apic ID %u, %lld\n", __func__,
                                       lp_index, apic_id, status);
-                               ret = status;
+                               ret = hv_result(status);
                        }
                        break;
                }
@@ -169,7 +165,7 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
        struct hv_create_vp *input;
        u64 status;
        unsigned long irq_flags;
-       int ret = 0;
+       int ret = HV_STATUS_SUCCESS;
        int pxm = node_to_pxm(node);
 
        /* Root VPs don't seem to need pages deposited */
@@ -200,13 +196,11 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
                status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL);
                local_irq_restore(irq_flags);
 
-               status &= HV_HYPERCALL_RESULT_MASK;
-
-               if (status != HV_STATUS_INSUFFICIENT_MEMORY) {
-                       if (status != HV_STATUS_SUCCESS) {
+               if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
+                       if (!hv_result_success(status)) {
                                pr_err("%s: vcpu %u, lp %u, %lld\n", __func__,
                                       vp_index, flags, status);
-                               ret = status;
+                               ret = hv_result(status);
                        }
                        break;
                }
index f3270c1..91cfe69 100644 (file)
@@ -25,7 +25,6 @@ static void hv_qlock_kick(int cpu)
 
 static void hv_qlock_wait(u8 *byte, u8 val)
 {
-       unsigned long msr_val;
        unsigned long flags;
 
        if (in_nmi())
@@ -48,8 +47,13 @@ static void hv_qlock_wait(u8 *byte, u8 val)
        /*
         * Only issue the rdmsrl() when the lock state has not changed.
         */
-       if (READ_ONCE(*byte) == val)
+       if (READ_ONCE(*byte) == val) {
+               unsigned long msr_val;
+
                rdmsrl(HV_X64_MSR_GUEST_IDLE, msr_val);
+
+               (void)msr_val;
+       }
        local_irq_restore(flags);
 }
 
index 4421a8d..514fc64 100644 (file)
@@ -63,10 +63,10 @@ static int hv_map_interrupt(union hv_device_id device_id, bool level,
 
        local_irq_restore(flags);
 
-       if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS)
+       if (!hv_result_success(status))
                pr_err("%s: hypercall failed, status %lld\n", __func__, status);
 
-       return status & HV_HYPERCALL_RESULT_MASK;
+       return hv_result(status);
 }
 
 static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry)
@@ -88,7 +88,7 @@ static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry)
        status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL);
        local_irq_restore(flags);
 
-       return status & HV_HYPERCALL_RESULT_MASK;
+       return hv_result(status);
 }
 
 #ifdef CONFIG_PCI_MSI
index 2c87350..c0ba887 100644 (file)
@@ -58,7 +58,7 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
        int cpu, vcpu, gva_n, max_gvas;
        struct hv_tlb_flush **flush_pcpu;
        struct hv_tlb_flush *flush;
-       u64 status = U64_MAX;
+       u64 status;
        unsigned long flags;
 
        trace_hyperv_mmu_flush_tlb_others(cpus, info);
@@ -161,7 +161,7 @@ do_ex_hypercall:
 check_status:
        local_irq_restore(flags);
 
-       if (!(status & HV_HYPERCALL_RESULT_MASK))
+       if (hv_result_success(status))
                return;
 do_native:
        native_flush_tlb_others(cpus, info);
@@ -176,7 +176,7 @@ static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
        u64 status;
 
        if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
-               return U64_MAX;
+               return HV_STATUS_INVALID_PARAMETER;
 
        flush_pcpu = (struct hv_tlb_flush_ex **)
                     this_cpu_ptr(hyperv_pcpu_input_arg);
@@ -201,7 +201,7 @@ static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
        flush->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
        nr_bank = cpumask_to_vpset(&(flush->hv_vp_set), cpus);
        if (nr_bank < 0)
-               return U64_MAX;
+               return HV_STATUS_INVALID_PARAMETER;
 
        /*
         * We can flush not more than max_gvas with one hypercall. Flush the
index dd0a843..5d70968 100644 (file)
@@ -47,7 +47,7 @@ int hyperv_flush_guest_mapping(u64 as)
                                 flush, NULL);
        local_irq_restore(flags);
 
-       if (!(status & HV_HYPERCALL_RESULT_MASK))
+       if (hv_result_success(status))
                ret = 0;
 
 fault:
@@ -92,7 +92,7 @@ int hyperv_flush_guest_mapping_range(u64 as,
 {
        struct hv_guest_mapping_flush_list **flush_pcpu;
        struct hv_guest_mapping_flush_list *flush;
-       u64 status = 0;
+       u64 status;
        unsigned long flags;
        int ret = -ENOTSUPP;
        int gpa_n = 0;
@@ -125,10 +125,10 @@ int hyperv_flush_guest_mapping_range(u64 as,
 
        local_irq_restore(flags);
 
-       if (!(status & HV_HYPERCALL_RESULT_MASK))
+       if (hv_result_success(status))
                ret = 0;
        else
-               ret = status;
+               ret = hv_result(status);
 fault:
        trace_hyperv_nested_flush_guest_mapping_range(as, ret);
        return ret;
index e6cd3fe..606f5cc 100644 (file)
@@ -156,7 +156,7 @@ enum hv_isolation_type {
 #define HV_X64_MSR_HYPERCALL                   0x40000001
 
 /* MSR used to provide vcpu index */
-#define HV_X64_MSR_VP_INDEX                    0x40000002
+#define HV_REGISTER_VP_INDEX                   0x40000002
 
 /* MSR used to reset the guest OS. */
 #define HV_X64_MSR_RESET                       0x40000003
@@ -165,10 +165,10 @@ enum hv_isolation_type {
 #define HV_X64_MSR_VP_RUNTIME                  0x40000010
 
 /* MSR used to read the per-partition time reference counter */
-#define HV_X64_MSR_TIME_REF_COUNT              0x40000020
+#define HV_REGISTER_TIME_REF_COUNT             0x40000020
 
 /* A partition's reference time stamp counter (TSC) page */
-#define HV_X64_MSR_REFERENCE_TSC               0x40000021
+#define HV_REGISTER_REFERENCE_TSC              0x40000021
 
 /* MSR used to retrieve the TSC frequency */
 #define HV_X64_MSR_TSC_FREQUENCY               0x40000022
@@ -183,50 +183,50 @@ enum hv_isolation_type {
 #define HV_X64_MSR_VP_ASSIST_PAGE              0x40000073
 
 /* Define synthetic interrupt controller model specific registers. */
-#define HV_X64_MSR_SCONTROL                    0x40000080
-#define HV_X64_MSR_SVERSION                    0x40000081
-#define HV_X64_MSR_SIEFP                       0x40000082
-#define HV_X64_MSR_SIMP                                0x40000083
-#define HV_X64_MSR_EOM                         0x40000084
-#define HV_X64_MSR_SINT0                       0x40000090
-#define HV_X64_MSR_SINT1                       0x40000091
-#define HV_X64_MSR_SINT2                       0x40000092
-#define HV_X64_MSR_SINT3                       0x40000093
-#define HV_X64_MSR_SINT4                       0x40000094
-#define HV_X64_MSR_SINT5                       0x40000095
-#define HV_X64_MSR_SINT6                       0x40000096
-#define HV_X64_MSR_SINT7                       0x40000097
-#define HV_X64_MSR_SINT8                       0x40000098
-#define HV_X64_MSR_SINT9                       0x40000099
-#define HV_X64_MSR_SINT10                      0x4000009A
-#define HV_X64_MSR_SINT11                      0x4000009B
-#define HV_X64_MSR_SINT12                      0x4000009C
-#define HV_X64_MSR_SINT13                      0x4000009D
-#define HV_X64_MSR_SINT14                      0x4000009E
-#define HV_X64_MSR_SINT15                      0x4000009F
+#define HV_REGISTER_SCONTROL                   0x40000080
+#define HV_REGISTER_SVERSION                   0x40000081
+#define HV_REGISTER_SIEFP                      0x40000082
+#define HV_REGISTER_SIMP                       0x40000083
+#define HV_REGISTER_EOM                                0x40000084
+#define HV_REGISTER_SINT0                      0x40000090
+#define HV_REGISTER_SINT1                      0x40000091
+#define HV_REGISTER_SINT2                      0x40000092
+#define HV_REGISTER_SINT3                      0x40000093
+#define HV_REGISTER_SINT4                      0x40000094
+#define HV_REGISTER_SINT5                      0x40000095
+#define HV_REGISTER_SINT6                      0x40000096
+#define HV_REGISTER_SINT7                      0x40000097
+#define HV_REGISTER_SINT8                      0x40000098
+#define HV_REGISTER_SINT9                      0x40000099
+#define HV_REGISTER_SINT10                     0x4000009A
+#define HV_REGISTER_SINT11                     0x4000009B
+#define HV_REGISTER_SINT12                     0x4000009C
+#define HV_REGISTER_SINT13                     0x4000009D
+#define HV_REGISTER_SINT14                     0x4000009E
+#define HV_REGISTER_SINT15                     0x4000009F
 
 /*
  * Synthetic Timer MSRs. Four timers per vcpu.
  */
-#define HV_X64_MSR_STIMER0_CONFIG              0x400000B0
-#define HV_X64_MSR_STIMER0_COUNT               0x400000B1
-#define HV_X64_MSR_STIMER1_CONFIG              0x400000B2
-#define HV_X64_MSR_STIMER1_COUNT               0x400000B3
-#define HV_X64_MSR_STIMER2_CONFIG              0x400000B4
-#define HV_X64_MSR_STIMER2_COUNT               0x400000B5
-#define HV_X64_MSR_STIMER3_CONFIG              0x400000B6
-#define HV_X64_MSR_STIMER3_COUNT               0x400000B7
+#define HV_REGISTER_STIMER0_CONFIG             0x400000B0
+#define HV_REGISTER_STIMER0_COUNT              0x400000B1
+#define HV_REGISTER_STIMER1_CONFIG             0x400000B2
+#define HV_REGISTER_STIMER1_COUNT              0x400000B3
+#define HV_REGISTER_STIMER2_CONFIG             0x400000B4
+#define HV_REGISTER_STIMER2_COUNT              0x400000B5
+#define HV_REGISTER_STIMER3_CONFIG             0x400000B6
+#define HV_REGISTER_STIMER3_COUNT              0x400000B7
 
 /* Hyper-V guest idle MSR */
 #define HV_X64_MSR_GUEST_IDLE                  0x400000F0
 
 /* Hyper-V guest crash notification MSR's */
-#define HV_X64_MSR_CRASH_P0                    0x40000100
-#define HV_X64_MSR_CRASH_P1                    0x40000101
-#define HV_X64_MSR_CRASH_P2                    0x40000102
-#define HV_X64_MSR_CRASH_P3                    0x40000103
-#define HV_X64_MSR_CRASH_P4                    0x40000104
-#define HV_X64_MSR_CRASH_CTL                   0x40000105
+#define HV_REGISTER_CRASH_P0                   0x40000100
+#define HV_REGISTER_CRASH_P1                   0x40000101
+#define HV_REGISTER_CRASH_P2                   0x40000102
+#define HV_REGISTER_CRASH_P3                   0x40000103
+#define HV_REGISTER_CRASH_P4                   0x40000104
+#define HV_REGISTER_CRASH_CTL                  0x40000105
 
 /* TSC emulation after migration */
 #define HV_X64_MSR_REENLIGHTENMENT_CONTROL     0x40000106
@@ -236,6 +236,32 @@ enum hv_isolation_type {
 /* TSC invariant control */
 #define HV_X64_MSR_TSC_INVARIANT_CONTROL       0x40000118
 
+/* Register name aliases for temporary compatibility */
+#define HV_X64_MSR_STIMER0_COUNT       HV_REGISTER_STIMER0_COUNT
+#define HV_X64_MSR_STIMER0_CONFIG      HV_REGISTER_STIMER0_CONFIG
+#define HV_X64_MSR_STIMER1_COUNT       HV_REGISTER_STIMER1_COUNT
+#define HV_X64_MSR_STIMER1_CONFIG      HV_REGISTER_STIMER1_CONFIG
+#define HV_X64_MSR_STIMER2_COUNT       HV_REGISTER_STIMER2_COUNT
+#define HV_X64_MSR_STIMER2_CONFIG      HV_REGISTER_STIMER2_CONFIG
+#define HV_X64_MSR_STIMER3_COUNT       HV_REGISTER_STIMER3_COUNT
+#define HV_X64_MSR_STIMER3_CONFIG      HV_REGISTER_STIMER3_CONFIG
+#define HV_X64_MSR_SCONTROL            HV_REGISTER_SCONTROL
+#define HV_X64_MSR_SVERSION            HV_REGISTER_SVERSION
+#define HV_X64_MSR_SIMP                        HV_REGISTER_SIMP
+#define HV_X64_MSR_SIEFP               HV_REGISTER_SIEFP
+#define HV_X64_MSR_VP_INDEX            HV_REGISTER_VP_INDEX
+#define HV_X64_MSR_EOM                 HV_REGISTER_EOM
+#define HV_X64_MSR_SINT0               HV_REGISTER_SINT0
+#define HV_X64_MSR_SINT15              HV_REGISTER_SINT15
+#define HV_X64_MSR_CRASH_P0            HV_REGISTER_CRASH_P0
+#define HV_X64_MSR_CRASH_P1            HV_REGISTER_CRASH_P1
+#define HV_X64_MSR_CRASH_P2            HV_REGISTER_CRASH_P2
+#define HV_X64_MSR_CRASH_P3            HV_REGISTER_CRASH_P3
+#define HV_X64_MSR_CRASH_P4            HV_REGISTER_CRASH_P4
+#define HV_X64_MSR_CRASH_CTL           HV_REGISTER_CRASH_CTL
+#define HV_X64_MSR_TIME_REF_COUNT      HV_REGISTER_TIME_REF_COUNT
+#define HV_X64_MSR_REFERENCE_TSC       HV_REGISTER_REFERENCE_TSC
+
 /*
  * Declare the MSR used to setup pages used to communicate with the hypervisor.
  */
@@ -288,35 +314,6 @@ struct hv_tsc_emulation_status {
 #define HV_X64_MSR_TSC_REFERENCE_ENABLE                0x00000001
 #define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12
 
-
-/* Define hypervisor message types. */
-enum hv_message_type {
-       HVMSG_NONE                      = 0x00000000,
-
-       /* Memory access messages. */
-       HVMSG_UNMAPPED_GPA              = 0x80000000,
-       HVMSG_GPA_INTERCEPT             = 0x80000001,
-
-       /* Timer notification messages. */
-       HVMSG_TIMER_EXPIRED             = 0x80000010,
-
-       /* Error messages. */
-       HVMSG_INVALID_VP_REGISTER_VALUE = 0x80000020,
-       HVMSG_UNRECOVERABLE_EXCEPTION   = 0x80000021,
-       HVMSG_UNSUPPORTED_FEATURE       = 0x80000022,
-
-       /* Trace buffer complete messages. */
-       HVMSG_EVENTLOG_BUFFERCOMPLETE   = 0x80000040,
-
-       /* Platform-specific processor intercept messages. */
-       HVMSG_X64_IOPORT_INTERCEPT      = 0x80010000,
-       HVMSG_X64_MSR_INTERCEPT         = 0x80010001,
-       HVMSG_X64_CPUID_INTERCEPT       = 0x80010002,
-       HVMSG_X64_EXCEPTION_INTERCEPT   = 0x80010003,
-       HVMSG_X64_APIC_EOI              = 0x80010004,
-       HVMSG_X64_LEGACY_FP_ERROR       = 0x80010005
-};
-
 struct hv_nested_enlightenments_control {
        struct {
                __u32 directhypercall:1;
index e7be720..67ff0d6 100644 (file)
@@ -9,70 +9,29 @@
 #include <asm/hyperv-tlfs.h>
 #include <asm/nospec-branch.h>
 #include <asm/paravirt.h>
+#include <asm/mshyperv.h>
 
 typedef int (*hyperv_fill_flush_list_func)(
                struct hv_guest_mapping_flush_list *flush,
                void *data);
 
-#define hv_init_timer(timer, tick) \
-       wrmsrl(HV_X64_MSR_STIMER0_COUNT + (2*timer), tick)
-#define hv_init_timer_config(timer, val) \
-       wrmsrl(HV_X64_MSR_STIMER0_CONFIG + (2*timer), val)
-
-#define hv_get_simp(val) rdmsrl(HV_X64_MSR_SIMP, val)
-#define hv_set_simp(val) wrmsrl(HV_X64_MSR_SIMP, val)
-
-#define hv_get_siefp(val) rdmsrl(HV_X64_MSR_SIEFP, val)
-#define hv_set_siefp(val) wrmsrl(HV_X64_MSR_SIEFP, val)
-
-#define hv_get_synic_state(val) rdmsrl(HV_X64_MSR_SCONTROL, val)
-#define hv_set_synic_state(val) wrmsrl(HV_X64_MSR_SCONTROL, val)
-
-#define hv_get_vp_index(index) rdmsrl(HV_X64_MSR_VP_INDEX, index)
-
-#define hv_signal_eom() wrmsrl(HV_X64_MSR_EOM, 0)
-
-#define hv_get_synint_state(int_num, val) \
-       rdmsrl(HV_X64_MSR_SINT0 + int_num, val)
-#define hv_set_synint_state(int_num, val) \
-       wrmsrl(HV_X64_MSR_SINT0 + int_num, val)
-#define hv_recommend_using_aeoi() \
-       (!(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED))
+static inline void hv_set_register(unsigned int reg, u64 value)
+{
+       wrmsrl(reg, value);
+}
 
-#define hv_get_crash_ctl(val) \
-       rdmsrl(HV_X64_MSR_CRASH_CTL, val)
+static inline u64 hv_get_register(unsigned int reg)
+{
+       u64 value;
 
-#define hv_get_time_ref_count(val) \
-       rdmsrl(HV_X64_MSR_TIME_REF_COUNT, val)
+       rdmsrl(reg, value);
+       return value;
+}
 
-#define hv_get_reference_tsc(val) \
-       rdmsrl(HV_X64_MSR_REFERENCE_TSC, val)
-#define hv_set_reference_tsc(val) \
-       wrmsrl(HV_X64_MSR_REFERENCE_TSC, val)
-#define hv_set_clocksource_vdso(val) \
-       ((val).vdso_clock_mode = VDSO_CLOCKMODE_HVCLOCK)
-#define hv_enable_vdso_clocksource() \
-       vclocks_set_used(VDSO_CLOCKMODE_HVCLOCK);
 #define hv_get_raw_timer() rdtsc_ordered()
-#define hv_get_vector() HYPERVISOR_CALLBACK_VECTOR
-
-/*
- * Reference to pv_ops must be inline so objtool
- * detection of noinstr violations can work correctly.
- */
-static __always_inline void hv_setup_sched_clock(void *sched_clock)
-{
-#ifdef CONFIG_PARAVIRT
-       paravirt_set_sched_clock(sched_clock);
-#endif
-}
 
 void hyperv_vector_handler(struct pt_regs *regs);
 
-static inline void hv_enable_stimer0_percpu_irq(int irq) {}
-static inline void hv_disable_stimer0_percpu_irq(int irq) {}
-
-
 #if IS_ENABLED(CONFIG_HYPERV)
 extern int hyperv_init_cpuhp;
 
@@ -189,38 +148,6 @@ static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2)
        return hv_status;
 }
 
-/*
- * Rep hypercalls. Callers of this functions are supposed to ensure that
- * rep_count and varhead_size comply with Hyper-V hypercall definition.
- */
-static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
-                                     void *input, void *output)
-{
-       u64 control = code;
-       u64 status;
-       u16 rep_comp;
-
-       control |= (u64)varhead_size << HV_HYPERCALL_VARHEAD_OFFSET;
-       control |= (u64)rep_count << HV_HYPERCALL_REP_COMP_OFFSET;
-
-       do {
-               status = hv_do_hypercall(control, input, output);
-               if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS)
-                       return status;
-
-               /* Bits 32-43 of status have 'Reps completed' data. */
-               rep_comp = (status & HV_HYPERCALL_REP_COMP_MASK) >>
-                       HV_HYPERCALL_REP_COMP_OFFSET;
-
-               control &= ~HV_HYPERCALL_REP_START_MASK;
-               control |= (u64)rep_comp << HV_HYPERCALL_REP_START_OFFSET;
-
-               touch_nmi_watchdog();
-       } while (rep_comp < rep_count);
-
-       return status;
-}
-
 extern struct hv_vp_assist_page **hv_vp_assist_page;
 
 static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
@@ -233,9 +160,6 @@ static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
 
 void __init hyperv_init(void);
 void hyperv_setup_mmu_ops(void);
-void *hv_alloc_hyperv_page(void);
-void *hv_alloc_hyperv_zeroed_page(void);
-void hv_free_hyperv_page(unsigned long addr);
 void set_hv_tscchange_cb(void (*cb)(void));
 void clear_hv_tscchange_cb(void);
 void hyperv_stop_tsc_emulation(void);
@@ -272,8 +196,6 @@ int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
 #else /* CONFIG_HYPERV */
 static inline void hyperv_init(void) {}
 static inline void hyperv_setup_mmu_ops(void) {}
-static inline void *hv_alloc_hyperv_page(void) { return NULL; }
-static inline void hv_free_hyperv_page(unsigned long addr) {}
 static inline void set_hv_tscchange_cb(void (*cb)(void)) {}
 static inline void clear_hv_tscchange_cb(void) {}
 static inline void hyperv_stop_tsc_emulation(void) {};
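
The new hv_get_register()/hv_set_register() accessors wrap rdmsrl()/wrmsrl() on
x86 so that common code can manipulate synthetic registers by name instead of
via the per-register macros removed above. A minimal sketch, assuming the
union hv_synic_sint layout from hyperv-tlfs.h and the VMBUS_MESSAGE_SINT
constant from the VMbus driver:

	/* Sketch: unmask the VMbus message SINT via the generic accessors */
	union hv_synic_sint sint;

	sint.as_uint64 = hv_get_register(HV_REGISTER_SINT0 + VMBUS_MESSAGE_SINT);
	sint.masked = false;
	hv_set_register(HV_REGISTER_SINT0 + VMBUS_MESSAGE_SINT, sint.as_uint64);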
index 415bc05..22f1334 100644 (file)
@@ -60,23 +60,18 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback)
        set_irq_regs(old_regs);
 }
 
-int hv_setup_vmbus_irq(int irq, void (*handler)(void))
+void hv_setup_vmbus_handler(void (*handler)(void))
 {
-       /*
-        * The 'irq' argument is ignored on x86/x64 because a hard-coded
-        * interrupt vector is used for Hyper-V interrupts.
-        */
        vmbus_handler = handler;
-       return 0;
 }
+EXPORT_SYMBOL_GPL(hv_setup_vmbus_handler);
 
-void hv_remove_vmbus_irq(void)
+void hv_remove_vmbus_handler(void)
 {
        /* We have no way to deallocate the interrupt gate */
        vmbus_handler = NULL;
 }
-EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq);
-EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq);
+EXPORT_SYMBOL_GPL(hv_remove_vmbus_handler);
 
 /*
  * Routines to do per-architecture handling of stimer0
@@ -95,21 +90,17 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0)
        set_irq_regs(old_regs);
 }
 
-int hv_setup_stimer0_irq(int *irq, int *vector, void (*handler)(void))
+/* For x86/x64, override weak placeholders in hyperv_timer.c */
+void hv_setup_stimer0_handler(void (*handler)(void))
 {
-       *vector = HYPERV_STIMER0_VECTOR;
-       *irq = -1;   /* Unused on x86/x64 */
        hv_stimer0_handler = handler;
-       return 0;
 }
-EXPORT_SYMBOL_GPL(hv_setup_stimer0_irq);
 
-void hv_remove_stimer0_irq(int irq)
+void hv_remove_stimer0_handler(void)
 {
        /* We have no way to deallocate the interrupt gate */
        hv_stimer0_handler = NULL;
 }
-EXPORT_SYMBOL_GPL(hv_remove_stimer0_irq);
 
 void hv_setup_kexec_handler(void (*handler)(void))
 {
@@ -274,12 +265,13 @@ static void __init ms_hyperv_init_platform(void)
         * Extract the features and hints
         */
        ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
-       ms_hyperv.features_b = cpuid_ebx(HYPERV_CPUID_FEATURES);
+       ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES);
        ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
        ms_hyperv.hints    = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
 
-       pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n",
-               ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features);
+       pr_info("Hyper-V: privilege flags low 0x%x, high 0x%x, hints 0x%x, misc 0x%x\n",
+               ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints,
+               ms_hyperv.misc_features);
 
        ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
        ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS);
@@ -325,7 +317,7 @@ static void __init ms_hyperv_init_platform(void)
                x86_platform.calibrate_cpu = hv_get_tsc_khz;
        }
 
-       if (ms_hyperv.features_b & HV_ISOLATION) {
+       if (ms_hyperv.priv_high & HV_ISOLATION) {
                ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG);
                ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG);
 
index a02b0a2..977fd05 100644 (file)
@@ -18,6 +18,9 @@
 #include <linux/sched_clock.h>
 #include <linux/mm.h>
 #include <linux/cpuhotplug.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/acpi.h>
 #include <clocksource/hyperv_timer.h>
 #include <asm/hyperv-tlfs.h>
 #include <asm/mshyperv.h>
@@ -43,14 +46,13 @@ static u64 hv_sched_clock_offset __ro_after_init;
  */
 static bool direct_mode_enabled;
 
-static int stimer0_irq;
-static int stimer0_vector;
+static int stimer0_irq = -1;
 static int stimer0_message_sint;
+static DEFINE_PER_CPU(long, stimer0_evt);
 
 /*
- * ISR for when stimer0 is operating in Direct Mode.  Direct Mode
- * does not use VMbus or any VMbus messages, so process here and not
- * in the VMbus driver code.
+ * Common code for stimer0 interrupts coming via Direct Mode or
+ * as a VMbus message.
  */
 void hv_stimer0_isr(void)
 {
@@ -61,6 +63,16 @@ void hv_stimer0_isr(void)
 }
 EXPORT_SYMBOL_GPL(hv_stimer0_isr);
 
+/*
+ * stimer0 interrupt handler for architectures that support
+ * per-cpu interrupts, which also implies Direct Mode.
+ */
+static irqreturn_t hv_stimer0_percpu_isr(int irq, void *dev_id)
+{
+       hv_stimer0_isr();
+       return IRQ_HANDLED;
+}
+
 static int hv_ce_set_next_event(unsigned long delta,
                                struct clock_event_device *evt)
 {
@@ -68,16 +80,16 @@ static int hv_ce_set_next_event(unsigned long delta,
 
        current_tick = hv_read_reference_counter();
        current_tick += delta;
-       hv_init_timer(0, current_tick);
+       hv_set_register(HV_REGISTER_STIMER0_COUNT, current_tick);
        return 0;
 }
 
 static int hv_ce_shutdown(struct clock_event_device *evt)
 {
-       hv_init_timer(0, 0);
-       hv_init_timer_config(0, 0);
-       if (direct_mode_enabled)
-               hv_disable_stimer0_percpu_irq(stimer0_irq);
+       hv_set_register(HV_REGISTER_STIMER0_COUNT, 0);
+       hv_set_register(HV_REGISTER_STIMER0_CONFIG, 0);
+       if (direct_mode_enabled && stimer0_irq >= 0)
+               disable_percpu_irq(stimer0_irq);
 
        return 0;
 }
@@ -95,8 +107,9 @@ static int hv_ce_set_oneshot(struct clock_event_device *evt)
                 * on the specified hardware vector/IRQ.
                 */
                timer_cfg.direct_mode = 1;
-               timer_cfg.apic_vector = stimer0_vector;
-               hv_enable_stimer0_percpu_irq(stimer0_irq);
+               timer_cfg.apic_vector = HYPERV_STIMER0_VECTOR;
+               if (stimer0_irq >= 0)
+                       enable_percpu_irq(stimer0_irq, IRQ_TYPE_NONE);
        } else {
                /*
                 * When it expires, the timer will generate a VMbus message,
@@ -105,7 +118,7 @@ static int hv_ce_set_oneshot(struct clock_event_device *evt)
                timer_cfg.direct_mode = 0;
                timer_cfg.sintx = stimer0_message_sint;
        }
-       hv_init_timer_config(0, timer_cfg.as_uint64);
+       hv_set_register(HV_REGISTER_STIMER0_CONFIG, timer_cfg.as_uint64);
        return 0;
 }
 
@@ -169,10 +182,58 @@ int hv_stimer_cleanup(unsigned int cpu)
 }
 EXPORT_SYMBOL_GPL(hv_stimer_cleanup);
 
+/*
+ * These placeholders are overridden by arch specific code on
+ * architectures that need special setup of the stimer0 IRQ because
+ * they don't support per-cpu IRQs (such as x86/x64).
+ */
+void __weak hv_setup_stimer0_handler(void (*handler)(void))
+{
+}
+
+void __weak hv_remove_stimer0_handler(void)
+{
+}
+
+/* Called only on architectures with per-cpu IRQs (i.e., not x86/x64) */
+static int hv_setup_stimer0_irq(void)
+{
+       int ret;
+
+       ret = acpi_register_gsi(NULL, HYPERV_STIMER0_VECTOR,
+                       ACPI_EDGE_SENSITIVE, ACPI_ACTIVE_HIGH);
+       if (ret < 0) {
+               pr_err("Can't register Hyper-V stimer0 GSI. Error %d", ret);
+               return ret;
+       }
+       stimer0_irq = ret;
+
+       ret = request_percpu_irq(stimer0_irq, hv_stimer0_percpu_isr,
+               "Hyper-V stimer0", &stimer0_evt);
+       if (ret) {
+               pr_err("Can't request Hyper-V stimer0 IRQ %d. Error %d",
+                       stimer0_irq, ret);
+               acpi_unregister_gsi(stimer0_irq);
+               stimer0_irq = -1;
+       }
+       return ret;
+}
+
+static void hv_remove_stimer0_irq(void)
+{
+       if (stimer0_irq == -1) {
+               hv_remove_stimer0_handler();
+       } else {
+               free_percpu_irq(stimer0_irq, &stimer0_evt);
+               acpi_unregister_gsi(stimer0_irq);
+               stimer0_irq = -1;
+       }
+}
+
 /* hv_stimer_alloc - Global initialization of the clockevent and stimer0 */
-int hv_stimer_alloc(void)
+int hv_stimer_alloc(bool have_percpu_irqs)
 {
-       int ret = 0;
+       int ret;
 
        /*
         * Synthetic timers are always available except on old versions of
@@ -188,29 +249,37 @@ int hv_stimer_alloc(void)
 
        direct_mode_enabled = ms_hyperv.misc_features &
                        HV_STIMER_DIRECT_MODE_AVAILABLE;
-       if (direct_mode_enabled) {
-               ret = hv_setup_stimer0_irq(&stimer0_irq, &stimer0_vector,
-                               hv_stimer0_isr);
+
+       /*
+        * If Direct Mode isn't enabled, the remainder of the initialization
+        * is done later by hv_stimer_legacy_init()
+        */
+       if (!direct_mode_enabled)
+               return 0;
+
+       if (have_percpu_irqs) {
+               ret = hv_setup_stimer0_irq();
                if (ret)
-                       goto free_percpu;
+                       goto free_clock_event;
+       } else {
+               hv_setup_stimer0_handler(hv_stimer0_isr);
+       }
 
-               /*
-                * Since we are in Direct Mode, stimer initialization
-                * can be done now with a CPUHP value in the same range
-                * as other clockevent devices.
-                */
-               ret = cpuhp_setup_state(CPUHP_AP_HYPERV_TIMER_STARTING,
-                               "clockevents/hyperv/stimer:starting",
-                               hv_stimer_init, hv_stimer_cleanup);
-               if (ret < 0)
-                       goto free_stimer0_irq;
+       /*
+        * Since we are in Direct Mode, stimer initialization
+        * can be done now with a CPUHP value in the same range
+        * as other clockevent devices.
+        */
+       ret = cpuhp_setup_state(CPUHP_AP_HYPERV_TIMER_STARTING,
+                       "clockevents/hyperv/stimer:starting",
+                       hv_stimer_init, hv_stimer_cleanup);
+       if (ret < 0) {
+               hv_remove_stimer0_irq();
+               goto free_clock_event;
        }
        return ret;
 
-free_stimer0_irq:
-       hv_remove_stimer0_irq(stimer0_irq);
-       stimer0_irq = 0;
-free_percpu:
+free_clock_event:
        free_percpu(hv_clock_event);
        hv_clock_event = NULL;
        return ret;
@@ -254,23 +323,6 @@ void hv_stimer_legacy_cleanup(unsigned int cpu)
 }
 EXPORT_SYMBOL_GPL(hv_stimer_legacy_cleanup);
 
-
-/* hv_stimer_free - Free global resources allocated by hv_stimer_alloc() */
-void hv_stimer_free(void)
-{
-       if (!hv_clock_event)
-               return;
-
-       if (direct_mode_enabled) {
-               cpuhp_remove_state(CPUHP_AP_HYPERV_TIMER_STARTING);
-               hv_remove_stimer0_irq(stimer0_irq);
-               stimer0_irq = 0;
-       }
-       free_percpu(hv_clock_event);
-       hv_clock_event = NULL;
-}
-EXPORT_SYMBOL_GPL(hv_stimer_free);
-
 /*
  * Do a global cleanup of clockevents for the cases of kexec and
  * vmbus exit
@@ -287,12 +339,17 @@ void hv_stimer_global_cleanup(void)
                hv_stimer_legacy_cleanup(cpu);
        }
 
-       /*
-        * If Direct Mode is enabled, the cpuhp teardown callback
-        * (hv_stimer_cleanup) will be run on all CPUs to stop the
-        * stimers.
-        */
-       hv_stimer_free();
+       if (!hv_clock_event)
+               return;
+
+       if (direct_mode_enabled) {
+               cpuhp_remove_state(CPUHP_AP_HYPERV_TIMER_STARTING);
+               hv_remove_stimer0_irq();
+               stimer0_irq = -1;
+       }
+       free_percpu(hv_clock_event);
+       hv_clock_event = NULL;
 }
 EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup);
 
@@ -302,14 +359,6 @@ EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup);
  * the other that uses the TSC reference page feature as defined in the
  * TLFS.  The MSR version is for compatibility with old versions of
  * Hyper-V and 32-bit x86.  The TSC reference page version is preferred.
- *
- * The Hyper-V clocksource ratings of 250 are chosen to be below the
- * TSC clocksource rating of 300.  In configurations where Hyper-V offers
- * an InvariantTSC, the TSC is not marked "unstable", so the TSC clocksource
- * is available and preferred.  With the higher rating, it will be the
- * default.  On older hardware and Hyper-V versions, the TSC is marked
- * "unstable", so no TSC clocksource is created and the selected Hyper-V
- * clocksource will be the default.
  */
 
 u64 (*hv_read_reference_counter)(void);
@@ -331,7 +380,7 @@ static u64 notrace read_hv_clock_tsc(void)
        u64 current_tick = hv_read_tsc_page(hv_get_tsc_page());
 
        if (current_tick == U64_MAX)
-               hv_get_time_ref_count(current_tick);
+               current_tick = hv_get_register(HV_REGISTER_TIME_REF_COUNT);
 
        return current_tick;
 }
@@ -352,9 +401,9 @@ static void suspend_hv_clock_tsc(struct clocksource *arg)
        u64 tsc_msr;
 
        /* Disable the TSC page */
-       hv_get_reference_tsc(tsc_msr);
+       tsc_msr = hv_get_register(HV_REGISTER_REFERENCE_TSC);
        tsc_msr &= ~BIT_ULL(0);
-       hv_set_reference_tsc(tsc_msr);
+       hv_set_register(HV_REGISTER_REFERENCE_TSC, tsc_msr);
 }
 
 
@@ -364,39 +413,44 @@ static void resume_hv_clock_tsc(struct clocksource *arg)
        u64 tsc_msr;
 
        /* Re-enable the TSC page */
-       hv_get_reference_tsc(tsc_msr);
+       tsc_msr = hv_get_register(HV_REGISTER_REFERENCE_TSC);
        tsc_msr &= GENMASK_ULL(11, 0);
        tsc_msr |= BIT_ULL(0) | (u64)phys_addr;
-       hv_set_reference_tsc(tsc_msr);
+       hv_set_register(HV_REGISTER_REFERENCE_TSC, tsc_msr);
 }
 
+#ifdef VDSO_CLOCKMODE_HVCLOCK
 static int hv_cs_enable(struct clocksource *cs)
 {
-       hv_enable_vdso_clocksource();
+       vclocks_set_used(VDSO_CLOCKMODE_HVCLOCK);
        return 0;
 }
+#endif
 
 static struct clocksource hyperv_cs_tsc = {
        .name   = "hyperv_clocksource_tsc_page",
-       .rating = 250,
+       .rating = 500,
        .read   = read_hv_clock_tsc_cs,
        .mask   = CLOCKSOURCE_MASK(64),
        .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
        .suspend= suspend_hv_clock_tsc,
        .resume = resume_hv_clock_tsc,
+#ifdef VDSO_CLOCKMODE_HVCLOCK
        .enable = hv_cs_enable,
+       .vdso_clock_mode = VDSO_CLOCKMODE_HVCLOCK,
+#else
+       .vdso_clock_mode = VDSO_CLOCKMODE_NONE,
+#endif
 };
 
 static u64 notrace read_hv_clock_msr(void)
 {
-       u64 current_tick;
        /*
         * Read the partition counter to get the current tick count. This count
         * is set to 0 when the partition is created and is incremented in
         * 100 nanosecond units.
         */
-       hv_get_time_ref_count(current_tick);
-       return current_tick;
+       return hv_get_register(HV_REGISTER_TIME_REF_COUNT);
 }
 
 static u64 notrace read_hv_clock_msr_cs(struct clocksource *arg)
@@ -412,12 +466,36 @@ static u64 notrace read_hv_sched_clock_msr(void)
 
 static struct clocksource hyperv_cs_msr = {
        .name   = "hyperv_clocksource_msr",
-       .rating = 250,
+       .rating = 500,
        .read   = read_hv_clock_msr_cs,
        .mask   = CLOCKSOURCE_MASK(64),
        .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
+/*
+ * Reference to pv_ops must be inline so objtool
+ * detection of noinstr violations can work correctly.
+ */
+#ifdef CONFIG_GENERIC_SCHED_CLOCK
+static __always_inline void hv_setup_sched_clock(void *sched_clock)
+{
+       /*
+        * We're on an architecture with generic sched clock (not x86/x64).
+        * The Hyper-V sched clock read function returns nanoseconds, not
+        * the normal 100ns units of the Hyper-V synthetic clock.
+        */
+       sched_clock_register(sched_clock, 64, NSEC_PER_SEC);
+}
+#elif defined CONFIG_PARAVIRT
+static __always_inline void hv_setup_sched_clock(void *sched_clock)
+{
+       /* We're on x86/x64 *and* using PV ops */
+       paravirt_set_sched_clock(sched_clock);
+}
+#else /* !CONFIG_GENERIC_SCHED_CLOCK && !CONFIG_PARAVIRT */
+static __always_inline void hv_setup_sched_clock(void *sched_clock) {}
+#endif /* CONFIG_GENERIC_SCHED_CLOCK */
+
 static bool __init hv_init_tsc_clocksource(void)
 {
        u64             tsc_msr;
@@ -429,6 +507,22 @@ static bool __init hv_init_tsc_clocksource(void)
        if (hv_root_partition)
                return false;
 
+       /*
+        * If Hyper-V offers TSC_INVARIANT, then the virtualized TSC correctly
+        * handles frequency and offset changes due to live migration,
+        * pause/resume, and other VM management operations.  So lower the
+        * Hyper-V Reference TSC rating, causing the generic TSC to be used.
+        * TSC_INVARIANT is not offered on ARM64, so the Hyper-V Reference
+        * TSC will be preferred over the virtualized ARM64 arch counter.
+        * While the Hyper-V MSR clocksource won't be used since the
+        * Reference TSC clocksource is present, change its rating as
+        * well for consistency.
+        */
+       if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) {
+               hyperv_cs_tsc.rating = 250;
+               hyperv_cs_msr.rating = 250;
+       }
+
        hv_read_reference_counter = read_hv_clock_tsc;
        phys_addr = virt_to_phys(hv_get_tsc_page());
 
@@ -439,12 +533,11 @@ static bool __init hv_init_tsc_clocksource(void)
         * (which already has at least the low 12 bits set to zero since
         * it is page aligned). Also set the "enable" bit, which is bit 0.
         */
-       hv_get_reference_tsc(tsc_msr);
+       tsc_msr = hv_get_register(HV_REGISTER_REFERENCE_TSC);
        tsc_msr &= GENMASK_ULL(11, 0);
        tsc_msr = tsc_msr | 0x1 | (u64)phys_addr;
-       hv_set_reference_tsc(tsc_msr);
+       hv_set_register(HV_REGISTER_REFERENCE_TSC, tsc_msr);
 
-       hv_set_clocksource_vdso(hyperv_cs_tsc);
        clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100);
 
        hv_sched_clock_offset = hv_read_reference_counter();
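
With the reworked hv_stimer_alloc(), the architecture init path chooses the
stimer0 delivery mechanism at the call site; roughly:

	/* x86/x64: no per-cpu IRQs, stimer0 arrives on a hard-coded vector */
	(void)hv_stimer_alloc(false);

	/*
	 * An architecture with per-cpu IRQs (e.g. Arm64) would pass true,
	 * routing stimer0 through acpi_register_gsi()/request_percpu_irq().
	 */
	ret = hv_stimer_alloc(true);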
index 79e5356..66c794d 100644 (file)
@@ -23,6 +23,7 @@ config HYPERV_UTILS
 config HYPERV_BALLOON
        tristate "Microsoft Hyper-V Balloon driver"
        depends on HYPERV
+       select PAGE_REPORTING
        help
          Select this option to enable Hyper-V Balloon driver.
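
The new "select PAGE_REPORTING" backs the free page reporting support added to
hv_balloon.c (not shown in this excerpt). The hookup follows the generic page
reporting API; a sketch, with the hv_free_page_report callback and the
dm_device global taken from the balloon driver:

	static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info,
				       struct scatterlist *sgl, unsigned int nents)
	{
		/* send a cold discard hint to the host for each sg range */
		return 0;
	}

	/* at init, once hv_query_ext_cap() has confirmed the capability: */
	dm_device.pr_dev_info.report = hv_free_page_report;
	page_reporting_register(&dm_device.pr_dev_info);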
 
index 0bd202d..c2635e9 100644 (file)
@@ -209,31 +209,96 @@ int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id,
 }
 EXPORT_SYMBOL_GPL(vmbus_send_tl_connect_request);
 
+static int send_modifychannel_without_ack(struct vmbus_channel *channel, u32 target_vp)
+{
+       struct vmbus_channel_modifychannel msg;
+       int ret;
+
+       memset(&msg, 0, sizeof(msg));
+       msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL;
+       msg.child_relid = channel->offermsg.child_relid;
+       msg.target_vp = target_vp;
+
+       ret = vmbus_post_msg(&msg, sizeof(msg), true);
+       trace_vmbus_send_modifychannel(&msg, ret);
+
+       return ret;
+}
+
+static int send_modifychannel_with_ack(struct vmbus_channel *channel, u32 target_vp)
+{
+       struct vmbus_channel_modifychannel *msg;
+       struct vmbus_channel_msginfo *info;
+       unsigned long flags;
+       int ret;
+
+       info = kzalloc(sizeof(struct vmbus_channel_msginfo) +
+                               sizeof(struct vmbus_channel_modifychannel),
+                      GFP_KERNEL);
+       if (!info)
+               return -ENOMEM;
+
+       init_completion(&info->waitevent);
+       info->waiting_channel = channel;
+
+       msg = (struct vmbus_channel_modifychannel *)info->msg;
+       msg->header.msgtype = CHANNELMSG_MODIFYCHANNEL;
+       msg->child_relid = channel->offermsg.child_relid;
+       msg->target_vp = target_vp;
+
+       spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
+       list_add_tail(&info->msglistentry, &vmbus_connection.chn_msg_list);
+       spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
+
+       ret = vmbus_post_msg(msg, sizeof(*msg), true);
+       trace_vmbus_send_modifychannel(msg, ret);
+       if (ret != 0) {
+               spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
+               list_del(&info->msglistentry);
+               spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
+               goto free_info;
+       }
+
+       /*
+        * Release channel_mutex; otherwise, vmbus_onoffer_rescind() could block on
+        * the mutex and be unable to signal the completion.
+        *
+        * See the caller target_cpu_store() for information about the usage of the
+        * mutex.
+        */
+       mutex_unlock(&vmbus_connection.channel_mutex);
+       wait_for_completion(&info->waitevent);
+       mutex_lock(&vmbus_connection.channel_mutex);
+
+       spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
+       list_del(&info->msglistentry);
+       spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
+
+       if (info->response.modify_response.status)
+               ret = -EAGAIN;
+
+free_info:
+       kfree(info);
+       return ret;
+}
+
 /*
  * Set/change the vCPU (@target_vp) the channel (@child_relid) will interrupt.
  *
- * CHANNELMSG_MODIFYCHANNEL messages are aynchronous.  Also, Hyper-V does not
- * ACK such messages.  IOW we can't know when the host will stop interrupting
- * the "old" vCPU and start interrupting the "new" vCPU for the given channel.
+ * CHANNELMSG_MODIFYCHANNEL messages are asynchronous.  When VMbus version 5.3
+ * or later is negotiated, Hyper-V always sends an ACK in response to such a
+ * message.  For VMbus version 5.2 and earlier, it never sends an ACK.  Without
+ * an ACK, we cannot know when the host will stop interrupting the "old"
+ * vCPU and start interrupting the "new" vCPU for the given channel.
  *
  * The CHANNELMSG_MODIFYCHANNEL message type is supported since VMBus version
  * VERSION_WIN10_V4_1.
  */
-int vmbus_send_modifychannel(u32 child_relid, u32 target_vp)
+int vmbus_send_modifychannel(struct vmbus_channel *channel, u32 target_vp)
 {
-       struct vmbus_channel_modifychannel conn_msg;
-       int ret;
-
-       memset(&conn_msg, 0, sizeof(conn_msg));
-       conn_msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL;
-       conn_msg.child_relid = child_relid;
-       conn_msg.target_vp = target_vp;
-
-       ret = vmbus_post_msg(&conn_msg, sizeof(conn_msg), true);
-
-       trace_vmbus_send_modifychannel(&conn_msg, ret);
-
-       return ret;
+       if (vmbus_proto_version >= VERSION_WIN10_V5_3)
+               return send_modifychannel_with_ack(channel, target_vp);
+       return send_modifychannel_without_ack(channel, target_vp);
 }
 EXPORT_SYMBOL_GPL(vmbus_send_modifychannel);
 
@@ -385,7 +450,7 @@ nomem:
  * @kbuffer: from kmalloc or vmalloc
  * @size: page-size multiple
  * @send_offset: the offset (in bytes) where the send ring buffer starts,
- *              should be 0 for BUFFER type gpadl
+ *              should be 0 for BUFFER type gpadl
  * @gpadl_handle: some funky thing
  */
 static int __vmbus_establish_gpadl(struct vmbus_channel *channel,
@@ -653,7 +718,7 @@ static int __vmbus_open(struct vmbus_channel *newchannel,
 
        if (newchannel->rescind) {
                err = -ENODEV;
-               goto error_free_info;
+               goto error_clean_msglist;
        }
 
        err = vmbus_post_msg(open_msg,
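
The caller side of vmbus_send_modifychannel() lives in target_cpu_store() in
vmbus_drv.c, which holds channel_mutex across the call (the with-ack path above
drops and re-takes it while waiting); a sketch of that call site:

	/* Sketch: retarget the channel's interrupt to a new CPU */
	ret = vmbus_send_modifychannel(channel,
				       hv_cpu_number_to_vp_number(target_cpu));
	if (ret == 0)
		channel->target_cpu = target_cpu;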
index f0ed730..caf6d0c 100644 (file)
@@ -333,7 +333,6 @@ fw_error:
        negop->icversion_data[1].minor = icmsg_minor;
        return found_match;
 }
-
 EXPORT_SYMBOL_GPL(vmbus_prep_negotiate_resp);
 
 /*
@@ -593,10 +592,10 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
         * CPUS_READ_UNLOCK             CPUS_WRITE_UNLOCK
         *
         * Forbids: CPU1's LOAD from *not* seeing CPU2's STORE &&
-        *              CPU2's SEARCH from *not* seeing CPU1's INSERT
+        *              CPU2's SEARCH from *not* seeing CPU1's INSERT
         *
         * Forbids: CPU2's SEARCH from seeing CPU1's INSERT &&
-        *              CPU2's LOAD from *not* seing CPU1's STORE
+        *              CPU2's LOAD from *not* seeing CPU1's STORE
         */
        cpus_read_lock();
 
@@ -756,6 +755,12 @@ static void init_vp_index(struct vmbus_channel *channel)
        free_cpumask_var(available_mask);
 }
 
+#define UNLOAD_DELAY_UNIT_MS   10              /* 10 milliseconds */
+#define UNLOAD_WAIT_MS         (100*1000)      /* 100 seconds */
+#define UNLOAD_WAIT_LOOPS      (UNLOAD_WAIT_MS/UNLOAD_DELAY_UNIT_MS)
+#define UNLOAD_MSG_MS          (5*1000)        /* Every 5 seconds */
+#define UNLOAD_MSG_LOOPS       (UNLOAD_MSG_MS/UNLOAD_DELAY_UNIT_MS)
+
 static void vmbus_wait_for_unload(void)
 {
        int cpu;
@@ -773,12 +778,17 @@ static void vmbus_wait_for_unload(void)
         * vmbus_connection.unload_event. If not, the last thing we can do is
         * read message pages for all CPUs directly.
         *
-        * Wait no more than 10 seconds so that the panic path can't get
-        * hung forever in case the response message isn't seen.
+        * Wait up to 100 seconds since an Azure host must write back any dirty
+        * data in its disk cache before the VMbus UNLOAD request will
+        * complete. This flushing has been empirically observed to take up
+        * to 50 seconds in cases with a lot of dirty data, so allow additional
+        * leeway and for inaccuracies in mdelay(). But eventually time out so
+        * that the panic path can't get hung forever in case the response
+        * message isn't seen.
         */
-       for (i = 0; i < 1000; i++) {
+       for (i = 1; i <= UNLOAD_WAIT_LOOPS; i++) {
                if (completion_done(&vmbus_connection.unload_event))
-                       break;
+                       goto completed;
 
                for_each_online_cpu(cpu) {
                        struct hv_per_cpu_context *hv_cpu
@@ -801,9 +811,18 @@ static void vmbus_wait_for_unload(void)
                        vmbus_signal_eom(msg, message_type);
                }
 
-               mdelay(10);
+               /*
+                * Print a notice periodically so that someone watching
+                * the serial output won't think it is completely hung.
+                */
+               if (!(i % UNLOAD_MSG_LOOPS))
+                       pr_notice("Waiting for VMBus UNLOAD to complete\n");
+
+               mdelay(UNLOAD_DELAY_UNIT_MS);
        }
+       pr_err("Continuing even though VMBus UNLOAD did not complete\n");
 
+completed:
        /*
         * We're crashing and already got the UNLOAD_RESPONSE, cleanup all
         * maybe-pending messages on all CPUs to be able to receive new
@@ -827,6 +846,11 @@ static void vmbus_unload_response(struct vmbus_channel_message_header *hdr)
        /*
         * This is a global event; just wakeup the waiting thread.
         * Once we successfully unload, we can cleanup the monitor state.
+        *
+        * NB.  A malicious or compromised Hyper-V could send a spurious
+        * message of type CHANNELMSG_UNLOAD_RESPONSE, and trigger a call
+        * of the complete() below.  Make sure that unload_event has been
+        * initialized by the time this complete() is executed.
         */
        complete(&vmbus_connection.unload_event);
 }
@@ -842,7 +866,7 @@ void vmbus_initiate_unload(bool crash)
        if (vmbus_proto_version < VERSION_WIN8_1)
                return;
 
-       init_completion(&vmbus_connection.unload_event);
+       reinit_completion(&vmbus_connection.unload_event);
        memset(&hdr, 0, sizeof(struct vmbus_channel_message_header));
        hdr.msgtype = CHANNELMSG_UNLOAD;
        vmbus_post_msg(&hdr, sizeof(struct vmbus_channel_message_header),
@@ -980,7 +1004,7 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
                 *                                      UNLOCK channel_mutex
                 *
                 * Forbids: r1 == valid_relid &&
-                *              channels[valid_relid] == channel
+                *              channels[valid_relid] == channel
                 *
                 * Note.  r1 can be INVALID_RELID only for an hv_sock channel.
                 * None of the hv_sock channels which were present before the
@@ -1313,6 +1337,46 @@ static void vmbus_ongpadl_created(struct vmbus_channel_message_header *hdr)
 }
 
 /*
+ * vmbus_onmodifychannel_response - Modify Channel response handler.
+ *
+ * This is invoked when we receive a response to our channel modify request.
+ * Find the matching request, copy the response and signal the requesting thread.
+ */
+static void vmbus_onmodifychannel_response(struct vmbus_channel_message_header *hdr)
+{
+       struct vmbus_channel_modifychannel_response *response;
+       struct vmbus_channel_msginfo *msginfo;
+       unsigned long flags;
+
+       response = (struct vmbus_channel_modifychannel_response *)hdr;
+
+       trace_vmbus_onmodifychannel_response(response);
+
+       /*
+        * Find the modify msg, copy the response and signal/unblock the wait event.
+        */
+       spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
+
+       list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list, msglistentry) {
+               struct vmbus_channel_message_header *responseheader =
+                               (struct vmbus_channel_message_header *)msginfo->msg;
+
+               if (responseheader->msgtype == CHANNELMSG_MODIFYCHANNEL) {
+                       struct vmbus_channel_modifychannel *modifymsg;
+
+                       modifymsg = (struct vmbus_channel_modifychannel *)msginfo->msg;
+                       if (modifymsg->child_relid == response->child_relid) {
+                               memcpy(&msginfo->response.modify_response, response,
+                                      sizeof(*response));
+                               complete(&msginfo->waitevent);
+                               break;
+                       }
+               }
+       }
+       spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
+}
+
+/*
  * vmbus_ongpadl_torndown - GPADL torndown handler.
  *
  * This is invoked when we receive a response to our gpadl teardown request.
@@ -1429,6 +1493,8 @@ channel_message_table[CHANNELMSG_COUNT] = {
        { CHANNELMSG_TL_CONNECT_REQUEST,        0, NULL, 0},
        { CHANNELMSG_MODIFYCHANNEL,             0, NULL, 0},
        { CHANNELMSG_TL_CONNECT_RESULT,         0, NULL, 0},
+       { CHANNELMSG_MODIFYCHANNEL_RESPONSE,    1, vmbus_onmodifychannel_response,
+               sizeof(struct vmbus_channel_modifychannel_response)},
 };
 
 /*
index c83612c..311cd00 100644 (file)
 
 struct vmbus_connection vmbus_connection = {
        .conn_state             = DISCONNECTED,
+       .unload_event           = COMPLETION_INITIALIZER(
+                                 vmbus_connection.unload_event),
        .next_gpadl_handle      = ATOMIC_INIT(0xE1E10),
 
-       .ready_for_suspend_event= COMPLETION_INITIALIZER(
+       .ready_for_suspend_event = COMPLETION_INITIALIZER(
                                  vmbus_connection.ready_for_suspend_event),
        .ready_for_resume_event = COMPLETION_INITIALIZER(
                                  vmbus_connection.ready_for_resume_event),
@@ -45,6 +47,7 @@ EXPORT_SYMBOL_GPL(vmbus_proto_version);
  * Table of VMBus versions listed from newest to oldest.
  */
 static __u32 vmbus_versions[] = {
+       VERSION_WIN10_V5_3,
        VERSION_WIN10_V5_2,
        VERSION_WIN10_V5_1,
        VERSION_WIN10_V5,
@@ -60,7 +63,7 @@ static __u32 vmbus_versions[] = {
  * Maximal VMBus protocol version guests can negotiate.  Useful to cap the
  * VMBus version for testing and debugging purpose.
  */
-static uint max_version = VERSION_WIN10_V5_2;
+static uint max_version = VERSION_WIN10_V5_3;
 
 module_param(max_version, uint, S_IRUGO);
 MODULE_PARM_DESC(max_version,
index f202ac7..e83507f 100644 (file)
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/hyperv.h>
-#include <linux/version.h>
 #include <linux/random.h>
 #include <linux/clockchips.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
 #include <clocksource/hyperv_timer.h>
 #include <asm/mshyperv.h>
 #include "hyperv_vmbus.h"
@@ -37,6 +38,42 @@ int hv_init(void)
 }
 
 /*
+ * Functions for allocating and freeing memory with size and
+ * alignment HV_HYP_PAGE_SIZE. These functions are needed because
+ * the guest page size may not be the same as the Hyper-V page
+ * size. We depend upon kmalloc() aligning power-of-two size
+ * allocations to the allocation size boundary, so that the
+ * allocated memory appears to Hyper-V as a page of the size
+ * it expects.
+ */
+
+void *hv_alloc_hyperv_page(void)
+{
+       BUILD_BUG_ON(PAGE_SIZE < HV_HYP_PAGE_SIZE);
+
+       if (PAGE_SIZE == HV_HYP_PAGE_SIZE)
+               return (void *)__get_free_page(GFP_KERNEL);
+       else
+               return kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
+}
+
+void *hv_alloc_hyperv_zeroed_page(void)
+{
+       if (PAGE_SIZE == HV_HYP_PAGE_SIZE)
+               return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+       else
+               return kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
+}
+
+void hv_free_hyperv_page(unsigned long addr)
+{
+       if (PAGE_SIZE == HV_HYP_PAGE_SIZE)
+               free_page(addr);
+       else
+               kfree((void *)addr);
+}
+
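
A hypothetical caller of these helpers, for illustration only (not part of
the patch):

	/*
	 * On x86, PAGE_SIZE == HV_HYP_PAGE_SIZE and this is an ordinary
	 * page allocation; on a 64K-page ARM64 guest it becomes a 4K
	 * kmalloc() whose power-of-two size guarantees the alignment
	 * Hyper-V expects.
	 */
	void *page = hv_alloc_hyperv_zeroed_page();

	if (page) {
		/* ... pass virt_to_phys(page) to the hypervisor ... */
		hv_free_hyperv_page((unsigned long)page);
	}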
+/*
  * hv_post_message - Post a message using the hypervisor message IPC.
  *
  * This involves a hypercall.
@@ -68,7 +105,7 @@ int hv_post_message(union hv_connection_id connection_id,
         */
        put_cpu_ptr(hv_cpu);
 
-       return status & 0xFFFF;
+       return hv_result(status);
 }
 
 int hv_synic_alloc(void)
@@ -162,34 +199,48 @@ void hv_synic_enable_regs(unsigned int cpu)
        union hv_synic_scontrol sctrl;
 
        /* Setup the Synic's message page */
-       hv_get_simp(simp.as_uint64);
+       simp.as_uint64 = hv_get_register(HV_REGISTER_SIMP);
        simp.simp_enabled = 1;
        simp.base_simp_gpa = virt_to_phys(hv_cpu->synic_message_page)
                >> HV_HYP_PAGE_SHIFT;
 
-       hv_set_simp(simp.as_uint64);
+       hv_set_register(HV_REGISTER_SIMP, simp.as_uint64);
 
        /* Setup the Synic's event page */
-       hv_get_siefp(siefp.as_uint64);
+       siefp.as_uint64 = hv_get_register(HV_REGISTER_SIEFP);
        siefp.siefp_enabled = 1;
        siefp.base_siefp_gpa = virt_to_phys(hv_cpu->synic_event_page)
                >> HV_HYP_PAGE_SHIFT;
 
-       hv_set_siefp(siefp.as_uint64);
+       hv_set_register(HV_REGISTER_SIEFP, siefp.as_uint64);
 
        /* Setup the shared SINT. */
-       hv_get_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
+       if (vmbus_irq != -1)
+               enable_percpu_irq(vmbus_irq, 0);
+       shared_sint.as_uint64 = hv_get_register(HV_REGISTER_SINT0 +
+                                       VMBUS_MESSAGE_SINT);
 
-       shared_sint.vector = hv_get_vector();
+       shared_sint.vector = vmbus_interrupt;
        shared_sint.masked = false;
-       shared_sint.auto_eoi = hv_recommend_using_aeoi();
-       hv_set_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
+
+       /*
+        * On architectures where Hyper-V doesn't support AEOI (e.g., ARM64),
+        * it doesn't provide a recommendation flag and AEOI must be disabled.
+        */
+#ifdef HV_DEPRECATING_AEOI_RECOMMENDED
+       shared_sint.auto_eoi =
+                       !(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED);
+#else
+       shared_sint.auto_eoi = 0;
+#endif
+       hv_set_register(HV_REGISTER_SINT0 + VMBUS_MESSAGE_SINT,
+                               shared_sint.as_uint64);
 
        /* Enable the global synic bit */
-       hv_get_synic_state(sctrl.as_uint64);
+       sctrl.as_uint64 = hv_get_register(HV_REGISTER_SCONTROL);
        sctrl.enable = 1;
 
-       hv_set_synic_state(sctrl.as_uint64);
+       hv_set_register(HV_REGISTER_SCONTROL, sctrl.as_uint64);
 }
 
 int hv_synic_init(unsigned int cpu)
@@ -211,30 +262,71 @@ void hv_synic_disable_regs(unsigned int cpu)
        union hv_synic_siefp siefp;
        union hv_synic_scontrol sctrl;
 
-       hv_get_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
+       shared_sint.as_uint64 = hv_get_register(HV_REGISTER_SINT0 +
+                                       VMBUS_MESSAGE_SINT);
 
        shared_sint.masked = 1;
 
        /* Need to correctly cleanup in the case of SMP!!! */
        /* Disable the interrupt */
-       hv_set_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
+       hv_set_register(HV_REGISTER_SINT0 + VMBUS_MESSAGE_SINT,
+                               shared_sint.as_uint64);
 
-       hv_get_simp(simp.as_uint64);
+       simp.as_uint64 = hv_get_register(HV_REGISTER_SIMP);
        simp.simp_enabled = 0;
        simp.base_simp_gpa = 0;
 
-       hv_set_simp(simp.as_uint64);
+       hv_set_register(HV_REGISTER_SIMP, simp.as_uint64);
 
-       hv_get_siefp(siefp.as_uint64);
+       siefp.as_uint64 = hv_get_register(HV_REGISTER_SIEFP);
        siefp.siefp_enabled = 0;
        siefp.base_siefp_gpa = 0;
 
-       hv_set_siefp(siefp.as_uint64);
+       hv_set_register(HV_REGISTER_SIEFP, siefp.as_uint64);
 
        /* Disable the global synic bit */
-       hv_get_synic_state(sctrl.as_uint64);
+       sctrl.as_uint64 = hv_get_register(HV_REGISTER_SCONTROL);
        sctrl.enable = 0;
-       hv_set_synic_state(sctrl.as_uint64);
+       hv_set_register(HV_REGISTER_SCONTROL, sctrl.as_uint64);
+
+       if (vmbus_irq != -1)
+               disable_percpu_irq(vmbus_irq);
+}
+
+#define HV_MAX_TRIES 3
+/*
+ * Scan the event flags page of 'this' CPU looking for any bit that is set.  If we find one
+ * bit set, then wait for a few milliseconds and scan again, retrying up to 3 times.
+ * Return 'true' if any bit is still set after this operation; 'false' otherwise.
+ *
+ * If a bit is set, that means there is a pending channel interrupt.  The expectation is
+ * that the normal interrupt handling mechanism will find and process the channel interrupt
+ * "very soon", and in the process clear the bit.
+ */
+static bool hv_synic_event_pending(void)
+{
+       struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context);
+       union hv_synic_event_flags *event =
+               (union hv_synic_event_flags *)hv_cpu->synic_event_page + VMBUS_MESSAGE_SINT;
+       unsigned long *recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 */
+       bool pending;
+       u32 relid;
+       int tries = 0;
+
+retry:
+       pending = false;
+       for_each_set_bit(relid, recv_int_page, HV_EVENT_FLAGS_COUNT) {
+               /* Special case - VMBus channel protocol messages */
+               if (relid == 0)
+                       continue;
+               pending = true;
+               break;
+       }
+       if (pending && tries++ < HV_MAX_TRIES) {
+               usleep_range(10000, 20000);
+               goto retry;
+       }
+       return pending;
 }
 
 int hv_synic_cleanup(unsigned int cpu)
@@ -242,6 +334,9 @@ int hv_synic_cleanup(unsigned int cpu)
        struct vmbus_channel *channel, *sc;
        bool channel_found = false;
 
+       if (vmbus_connection.conn_state != CONNECTED)
+               goto always_cleanup;
+
        /*
         * Hyper-V does not provide a way to change the connect CPU once
         * it is set; we must prevent the connect CPU from going offline
@@ -249,8 +344,7 @@ int hv_synic_cleanup(unsigned int cpu)
         * path where the vmbus is already disconnected, the CPU must be
         * allowed to shut down.
         */
-       if (cpu == VMBUS_CONNECT_CPU &&
-           vmbus_connection.conn_state == CONNECTED)
+       if (cpu == VMBUS_CONNECT_CPU)
                return -EBUSY;
 
        /*
@@ -277,9 +371,21 @@ int hv_synic_cleanup(unsigned int cpu)
        }
        mutex_unlock(&vmbus_connection.channel_mutex);
 
-       if (channel_found && vmbus_connection.conn_state == CONNECTED)
+       if (channel_found)
+               return -EBUSY;
+
+       /*
+        * channel_found == false means that any channels that were previously
+        * assigned to the CPU have been reassigned elsewhere with a call to
+        * vmbus_send_modifychannel().  Scan the event flags page looking for
+        * bits that are set, and wait with a timeout for vmbus_chan_sched()
+        * to process such bits.  If bits are still set after this operation
+        * and VMBus is connected, fail the CPU offlining operation.
+        */
+       if (vmbus_proto_version >= VERSION_WIN10_V4_1 && hv_synic_event_pending())
                return -EBUSY;
 
+always_cleanup:
        hv_stimer_legacy_cleanup(cpu);
 
        hv_synic_disable_regs(cpu);
index 2f776d7..58af84e 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/memory.h>
 #include <linux/notifier.h>
 #include <linux/percpu_counter.h>
+#include <linux/page_reporting.h>
 
 #include <linux/hyperv.h>
 #include <asm/hyperv-tlfs.h>
@@ -563,6 +564,8 @@ struct hv_dynmem_device {
         * The negotiated version agreed by host.
         */
        __u32 version;
+
+       struct page_reporting_dev_info pr_dev_info;
 };
 
 static struct hv_dynmem_device dm_device;
@@ -1568,6 +1571,89 @@ static void balloon_onchannelcallback(void *context)
 
 }
 
+/* Hyper-V only supports reporting 2MB pages or higher */
+#define HV_MIN_PAGE_REPORTING_ORDER    9
+#define HV_MIN_PAGE_REPORTING_LEN (HV_HYP_PAGE_SIZE << HV_MIN_PAGE_REPORTING_ORDER)
+static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info,
+                   struct scatterlist *sgl, unsigned int nents)
+{
+       unsigned long flags;
+       struct hv_memory_hint *hint;
+       int i;
+       u64 status;
+       struct scatterlist *sg;
+
+       WARN_ON_ONCE(nents > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES);
+       WARN_ON_ONCE(sgl->length < HV_MIN_PAGE_REPORTING_LEN);
+       local_irq_save(flags);
+       hint = *(struct hv_memory_hint **)this_cpu_ptr(hyperv_pcpu_input_arg);
+       if (!hint) {
+               local_irq_restore(flags);
+               return -ENOSPC;
+       }
+
+       hint->type = HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD;
+       hint->reserved = 0;
+       for_each_sg(sgl, sg, nents, i) {
+               union hv_gpa_page_range *range;
+
+               range = &hint->ranges[i];
+               range->address_space = 0;
+               /* page reporting only reports 2MB pages or higher */
+               range->page.largepage = 1;
+               range->page.additional_pages =
+                       (sg->length / HV_MIN_PAGE_REPORTING_LEN) - 1;
+               range->page_size = HV_GPA_PAGE_RANGE_PAGE_SIZE_2MB;
+               range->base_large_pfn =
+                       page_to_hvpfn(sg_page(sg)) >> HV_MIN_PAGE_REPORTING_ORDER;
+       }
+
+       status = hv_do_rep_hypercall(HV_EXT_CALL_MEMORY_HEAT_HINT, nents, 0,
+                                    hint, NULL);
+       local_irq_restore(flags);
+       if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) {
+               pr_err("Cold memory discard hypercall failed with status %llx\n",
+                       status);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
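
To make the encoding concrete, a worked example with hypothetical numbers,
assuming 4K Hyper-V pages so that one 2MB large page spans 2^9 = 512 of
them:

/*
 * An 8MB scatterlist entry at guest physical address 0x40000000 encodes as:
 *
 *	largepage        = 1
 *	additional_pages = (8MB / 2MB) - 1         = 3
 *	page_size        = HV_GPA_PAGE_RANGE_PAGE_SIZE_2MB
 *	base_large_pfn   = (0x40000000 >> 12) >> 9 = 0x200
 *
 * i.e. four contiguous 2MB pages starting at 2MB-aligned Hyper-V PFN 0x200.
 */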
+static void enable_page_reporting(void)
+{
+       int ret;
+
+       /* Essentially, validate that 'pageblock_order' is big enough for 2MB reporting. */
+       if (pageblock_order < HV_MIN_PAGE_REPORTING_ORDER) {
+               pr_debug("Cold memory discard is only supported on 2MB pages and above\n");
+               return;
+       }
+
+       if (!hv_query_ext_cap(HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT)) {
+               pr_debug("Cold memory discard hint not supported by Hyper-V\n");
+               return;
+       }
+
+       BUILD_BUG_ON(PAGE_REPORTING_CAPACITY > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES);
+       dm_device.pr_dev_info.report = hv_free_page_report;
+       ret = page_reporting_register(&dm_device.pr_dev_info);
+       if (ret < 0) {
+               dm_device.pr_dev_info.report = NULL;
+               pr_err("Failed to enable cold memory discard: %d\n", ret);
+       } else {
+               pr_info("Cold memory discard hint enabled\n");
+       }
+}
+
+static void disable_page_reporting(void)
+{
+       if (dm_device.pr_dev_info.report) {
+               page_reporting_unregister(&dm_device.pr_dev_info);
+               dm_device.pr_dev_info.report = NULL;
+       }
+}
+
 static int balloon_connect_vsp(struct hv_device *dev)
 {
        struct dm_version_request version_req;
@@ -1713,6 +1799,7 @@ static int balloon_probe(struct hv_device *dev,
        if (ret != 0)
                return ret;
 
+       enable_page_reporting();
        dm_device.state = DM_INITIALIZED;
 
        dm_device.thread =
@@ -1727,6 +1814,7 @@ static int balloon_probe(struct hv_device *dev,
 probe_error:
        dm_device.state = DM_INIT_ERROR;
        dm_device.thread  = NULL;
+       disable_page_reporting();
        vmbus_close(dev->channel);
 #ifdef CONFIG_MEMORY_HOTPLUG
        unregister_memory_notifier(&hv_memory_nb);
@@ -1749,6 +1837,7 @@ static int balloon_remove(struct hv_device *dev)
        cancel_work_sync(&dm->ha_wrk.wrk);
 
        kthread_stop(dm->thread);
+       disable_page_reporting();
        vmbus_close(dev->channel);
 #ifdef CONFIG_MEMORY_HOTPLUG
        unregister_memory_notifier(&hv_memory_nb);
index 6063bb2..c02a171 100644 (file)
@@ -103,6 +103,21 @@ TRACE_EVENT(vmbus_ongpadl_created,
                    )
        );
 
+TRACE_EVENT(vmbus_onmodifychannel_response,
+           TP_PROTO(const struct vmbus_channel_modifychannel_response *response),
+           TP_ARGS(response),
+           TP_STRUCT__entry(
+                   __field(u32, child_relid)
+                   __field(u32, status)
+                   ),
+           TP_fast_assign(__entry->child_relid = response->child_relid;
+                          __entry->status = response->status;
+                   ),
+           TP_printk("child_relid 0x%x, status %d",
+                     __entry->child_relid,  __entry->status
+                   )
+       );
+
 TRACE_EVENT(vmbus_ongpadl_torndown,
            TP_PROTO(const struct vmbus_channel_gpadl_torndown *gpadltorndown),
            TP_ARGS(gpadltorndown),
index 35833d4..374f8af 100644 (file)
@@ -84,15 +84,6 @@ hv_set_next_write_location(struct hv_ring_buffer_info *ring_info,
        ring_info->ring_buffer->write_index = next_write_location;
 }
 
-/* Set the next read location for the specified ring buffer. */
-static inline void
-hv_set_next_read_location(struct hv_ring_buffer_info *ring_info,
-                   u32 next_read_location)
-{
-       ring_info->ring_buffer->read_index = next_read_location;
-       ring_info->priv_read_index = next_read_location;
-}
-
 /* Get the size of the ring buffer. */
 static inline u32
 hv_get_ring_buffersize(const struct hv_ring_buffer_info *ring_info)
@@ -313,7 +304,6 @@ int hv_ringbuffer_write(struct vmbus_channel *channel,
                rqst_id = vmbus_next_request_id(&channel->requestor, requestid);
                if (rqst_id == VMBUS_RQST_ERROR) {
                        spin_unlock_irqrestore(&outring_info->ring_lock, flags);
-                       pr_err("No request id available\n");
                        return -EAGAIN;
                }
        }
index 10dce9f..b12d682 100644 (file)
@@ -48,8 +48,10 @@ static int hyperv_cpuhp_online;
 
 static void *hv_panic_page;
 
+static long __percpu *vmbus_evt;
+
 /* Values parsed from ACPI DSDT */
-static int vmbus_irq;
+int vmbus_irq;
 int vmbus_interrupt;
 
 /*
@@ -1381,7 +1383,13 @@ static void vmbus_isr(void)
                        tasklet_schedule(&hv_cpu->msg_dpc);
        }
 
-       add_interrupt_randomness(hv_get_vector(), 0);
+       add_interrupt_randomness(vmbus_interrupt, 0);
+}
+
+static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id)
+{
+       vmbus_isr();
+       return IRQ_HANDLED;
 }
 
 /*
@@ -1392,22 +1400,36 @@ static void hv_kmsg_dump(struct kmsg_dumper *dumper,
                         enum kmsg_dump_reason reason)
 {
        size_t bytes_written;
-       phys_addr_t panic_pa;
 
        /* We are only interested in panics. */
        if ((reason != KMSG_DUMP_PANIC) || (!sysctl_record_panic_msg))
                return;
 
-       panic_pa = virt_to_phys(hv_panic_page);
-
        /*
         * Write dump contents to the page. No need to synchronize; panic should
         * be single-threaded.
         */
        kmsg_dump_get_buffer(dumper, false, hv_panic_page, HV_HYP_PAGE_SIZE,
                             &bytes_written);
-       if (bytes_written)
-               hyperv_report_panic_msg(panic_pa, bytes_written);
+       if (!bytes_written)
+               return;
+       /*
+        * P3 contains the physical address of the panic page and P4 contains
+        * the size of the panic data in that page. The rest of the registers
+        * are no-ops when the NOTIFY_MSG flag is set.
+        */
+       hv_set_register(HV_REGISTER_CRASH_P0, 0);
+       hv_set_register(HV_REGISTER_CRASH_P1, 0);
+       hv_set_register(HV_REGISTER_CRASH_P2, 0);
+       hv_set_register(HV_REGISTER_CRASH_P3, virt_to_phys(hv_panic_page));
+       hv_set_register(HV_REGISTER_CRASH_P4, bytes_written);
+
+       /*
+        * Let Hyper-V know there is crash data available along with
+        * the panic message.
+        */
+       hv_set_register(HV_REGISTER_CRASH_CTL,
+              (HV_CRASH_CTL_CRASH_NOTIFY | HV_CRASH_CTL_CRASH_NOTIFY_MSG));
 }
 
 static struct kmsg_dumper hv_kmsg_dumper = {
@@ -1482,9 +1504,28 @@ static int vmbus_bus_init(void)
        if (ret)
                return ret;
 
-       ret = hv_setup_vmbus_irq(vmbus_irq, vmbus_isr);
-       if (ret)
-               goto err_setup;
+       /*
+        * VMbus interrupts are best modeled as per-cpu interrupts. If
+        * on an architecture with support for per-cpu IRQs (e.g. ARM64),
+        * allocate a per-cpu IRQ using standard Linux kernel functionality.
+        * If not on such an architecture (e.g., x86/x64), then rely on
+        * code in the arch-specific portion of the code tree to connect
+        * the VMbus interrupt handler.
+        */
+
+       if (vmbus_irq == -1) {
+               hv_setup_vmbus_handler(vmbus_isr);
+       } else {
+               vmbus_evt = alloc_percpu(long);
+               ret = request_percpu_irq(vmbus_irq, vmbus_percpu_isr,
+                               "Hyper-V VMbus", vmbus_evt);
+               if (ret) {
+                       pr_err("Can't request Hyper-V VMbus IRQ %d, Err %d\n",
+                                       vmbus_irq, ret);
+                       free_percpu(vmbus_evt);
+                       goto err_setup;
+               }
+       }
 
        ret = hv_synic_alloc();
        if (ret)
@@ -1521,7 +1562,7 @@ static int vmbus_bus_init(void)
                 * Register for panic kmsg callback only if the right
                 * capability is supported by the hypervisor.
                 */
-               hv_get_crash_ctl(hyperv_crash_ctl);
+               hyperv_crash_ctl = hv_get_register(HV_REGISTER_CRASH_CTL);
                if (hyperv_crash_ctl & HV_CRASH_CTL_CRASH_NOTIFY_MSG)
                        hv_kmsg_dump_register();
 
@@ -1545,7 +1586,12 @@ err_connect:
 err_cpuhp:
        hv_synic_free();
 err_alloc:
-       hv_remove_vmbus_irq();
+       if (vmbus_irq == -1) {
+               hv_remove_vmbus_handler();
+       } else {
+               free_percpu_irq(vmbus_irq, vmbus_evt);
+               free_percpu(vmbus_evt);
+       }
 err_setup:
        bus_unregister(&hv_bus);
        unregister_sysctl_table(hv_ctl_table_hdr);
@@ -1802,13 +1848,15 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
        if (target_cpu == origin_cpu)
                goto cpu_store_unlock;
 
-       if (vmbus_send_modifychannel(channel->offermsg.child_relid,
+       if (vmbus_send_modifychannel(channel,
                                     hv_cpu_number_to_vp_number(target_cpu))) {
                ret = -EIO;
                goto cpu_store_unlock;
        }
 
        /*
+        * For versions before VERSION_WIN10_V5_3, the following warning holds:
+        *
         * Warning.  At this point, there is *no* guarantee that the host will
         * have successfully processed the vmbus_send_modifychannel() request.
         * See the header comment of vmbus_send_modifychannel() for more info.
@@ -2663,6 +2711,18 @@ static int __init hv_acpi_init(void)
                ret = -ETIMEDOUT;
                goto cleanup;
        }
+
+       /*
+        * If we're on an architecture with a hardcoded hypervisor
+        * vector (i.e. x86/x64), override the VMbus interrupt found
+        * in the ACPI tables. Ensure vmbus_irq is not set since the
+        * normal Linux IRQ mechanism is not used in this case.
+        */
+#ifdef HYPERVISOR_CALLBACK_VECTOR
+       vmbus_interrupt = HYPERVISOR_CALLBACK_VECTOR;
+       vmbus_irq = -1;
+#endif
+
        hv_debug_init();
 
        ret = vmbus_bus_init();
@@ -2693,7 +2753,12 @@ static void __exit vmbus_exit(void)
        vmbus_connection.conn_state = DISCONNECTED;
        hv_stimer_global_cleanup();
        vmbus_disconnect();
-       hv_remove_vmbus_irq();
+       if (vmbus_irq == -1) {
+               hv_remove_vmbus_handler();
+       } else {
+               free_percpu_irq(vmbus_irq, vmbus_evt);
+               free_percpu(vmbus_evt);
+       }
        for_each_online_cpu(cpu) {
                struct hv_per_cpu_context *hv_cpu
                        = per_cpu_ptr(hv_context.cpu_context, cpu);
index a313708..1ff4ce2 100644 (file)
@@ -1292,7 +1292,7 @@ exit_unlock:
         * resumes, hv_pci_restore_msi_state() is able to correctly restore
         * the interrupt with the correct affinity.
         */
-       if (res && hbus->state != hv_pcibus_removing)
+       if (!hv_result_success(res) && hbus->state != hv_pcibus_removing)
                dev_err(&hbus->hdev->device,
                        "%s() failed: %#llx", __func__, res);
 
index 4dc9077..a7e6eea 100644 (file)
@@ -308,7 +308,7 @@ static inline int synthvid_send(struct hv_device *hdev,
                               VM_PKT_DATA_INBAND, 0);
 
        if (ret)
-               pr_err("Unable to send packet via vmbus\n");
+               pr_err_ratelimited("Unable to send packet via vmbus; error %d\n", ret);
 
        return ret;
 }
index 83448e8..515c3fb 100644 (file)
@@ -89,9 +89,9 @@
 #define HV_ACCESS_STATS                                BIT(8)
 #define HV_DEBUGGING                           BIT(11)
 #define HV_CPU_MANAGEMENT                      BIT(12)
+#define HV_ENABLE_EXTENDED_HYPERCALLS          BIT(20)
 #define HV_ISOLATION                           BIT(22)
 
-
 /*
  * TSC page layout.
  */
@@ -159,11 +159,18 @@ struct ms_hyperv_tsc_page {
 #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af
 #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0
 
+/* Extended hypercalls */
+#define HV_EXT_CALL_QUERY_CAPABILITIES         0x8001
+#define HV_EXT_CALL_MEMORY_HEAT_HINT           0x8003
+
 #define HV_FLUSH_ALL_PROCESSORS                        BIT(0)
 #define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES    BIT(1)
 #define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY      BIT(2)
 #define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT     BIT(3)
 
+/* Extended capability bits */
+#define HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT BIT(8)
+
 enum HV_GENERIC_SET_FORMAT {
        HV_GENERIC_SET_SPARSE_4K,
        HV_GENERIC_SET_ALL,
@@ -220,6 +227,41 @@ enum HV_GENERIC_SET_FORMAT {
 #define HV_MESSAGE_PAYLOAD_BYTE_COUNT  (240)
 #define HV_MESSAGE_PAYLOAD_QWORD_COUNT (30)
 
+/*
+ * Define hypervisor message types. Some of the message types
+ * are x86/x64 specific, but there's no good way to separate
+ * them out into the arch-specific version of hyperv-tlfs.h
+ * because C doesn't provide a way to extend enum types.
+ * Keeping them all in the arch neutral hyperv-tlfs.h seems
+ * the least messy compromise.
+ */
+enum hv_message_type {
+       HVMSG_NONE                      = 0x00000000,
+
+       /* Memory access messages. */
+       HVMSG_UNMAPPED_GPA              = 0x80000000,
+       HVMSG_GPA_INTERCEPT             = 0x80000001,
+
+       /* Timer notification messages. */
+       HVMSG_TIMER_EXPIRED             = 0x80000010,
+
+       /* Error messages. */
+       HVMSG_INVALID_VP_REGISTER_VALUE = 0x80000020,
+       HVMSG_UNRECOVERABLE_EXCEPTION   = 0x80000021,
+       HVMSG_UNSUPPORTED_FEATURE       = 0x80000022,
+
+       /* Trace buffer complete messages. */
+       HVMSG_EVENTLOG_BUFFERCOMPLETE   = 0x80000040,
+
+       /* Platform-specific processor intercept messages. */
+       HVMSG_X64_IOPORT_INTERCEPT      = 0x80010000,
+       HVMSG_X64_MSR_INTERCEPT         = 0x80010001,
+       HVMSG_X64_CPUID_INTERCEPT       = 0x80010002,
+       HVMSG_X64_EXCEPTION_INTERCEPT   = 0x80010003,
+       HVMSG_X64_APIC_EOI              = 0x80010004,
+       HVMSG_X64_LEGACY_FP_ERROR       = 0x80010005
+};
+
 /* Define synthetic interrupt controller message flags. */
 union hv_message_flags {
        __u8 asu8;
@@ -373,8 +415,10 @@ struct hv_guest_mapping_flush {
  *  by the bitwidth of "additional_pages" in union hv_gpa_page_range.
  */
 #define HV_MAX_FLUSH_PAGES (2048)
+#define HV_GPA_PAGE_RANGE_PAGE_SIZE_2MB                0
+#define HV_GPA_PAGE_RANGE_PAGE_SIZE_1GB                1
 
-/* HvFlushGuestPhysicalAddressList hypercall */
+/* HvFlushGuestPhysicalAddressList, HvExtCallMemoryHeatHint hypercall */
 union hv_gpa_page_range {
        u64 address_space;
        struct {
@@ -382,6 +426,12 @@ union hv_gpa_page_range {
                u64 largepage:1;
                u64 basepfn:52;
        } page;
+       struct {
+               u64 reserved:12;
+               u64 page_size:1;
+               u64 reserved1:8;
+               u64 base_large_pfn:43;
+       };
 };
 
 /*
@@ -739,4 +789,20 @@ struct hv_input_unmap_device_interrupt {
 #define HV_SOURCE_SHADOW_NONE               0x0
 #define HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE   0x1
 
+/*
+ * The whole argument must fit in a single page so that it can be passed to
+ * the hypervisor in one hypercall.
+ */
+#define HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES  \
+       ((HV_HYP_PAGE_SIZE - sizeof(struct hv_memory_hint)) / \
+               sizeof(union hv_gpa_page_range))
+
+/* HvExtCallMemoryHeatHint hypercall */
+#define HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD      2
+struct hv_memory_hint {
+       u64 type:2;
+       u64 reserved:62;
+       union hv_gpa_page_range ranges[];
+} __packed;
+
 #endif
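
Worked sizing example for the macro above, assuming the TLFS-standard 4K
Hyper-V page: struct hv_memory_hint is a single u64 header (8 bytes) and
each union hv_gpa_page_range is also one u64, so a page holds
(4096 - 8) / 8 = 511 ranges. A hypothetical compile-time check:

static_assert(HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES == 511,
	      "a 4K page holds the hint header plus 511 GPA page ranges");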
index dff58a3..9a000ba 100644 (file)
@@ -27,7 +27,7 @@
 
 struct ms_hyperv_info {
        u32 features;
-       u32 features_b;
+       u32 priv_high;
        u32 misc_features;
        u32 hints;
        u32 nested_features;
@@ -41,6 +41,53 @@ extern struct ms_hyperv_info ms_hyperv;
 extern u64 hv_do_hypercall(u64 control, void *inputaddr, void *outputaddr);
 extern u64 hv_do_fast_hypercall8(u16 control, u64 input8);
 
+/* Helper functions that provide a consistent pattern for checking Hyper-V hypercall status. */
+static inline int hv_result(u64 status)
+{
+       return status & HV_HYPERCALL_RESULT_MASK;
+}
+
+static inline bool hv_result_success(u64 status)
+{
+       return hv_result(status) == HV_STATUS_SUCCESS;
+}
+
+static inline unsigned int hv_repcomp(u64 status)
+{
+       /* Bits [43:32] of status have 'Reps completed' data. */
+       return (status & HV_HYPERCALL_REP_COMP_MASK) >>
+                        HV_HYPERCALL_REP_COMP_OFFSET;
+}
+
+/*
+ * Rep hypercalls. Callers of this function are supposed to ensure that
+ * rep_count and varhead_size comply with the Hyper-V hypercall definition.
+ */
+static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
+                                     void *input, void *output)
+{
+       u64 control = code;
+       u64 status;
+       u16 rep_comp;
+
+       control |= (u64)varhead_size << HV_HYPERCALL_VARHEAD_OFFSET;
+       control |= (u64)rep_count << HV_HYPERCALL_REP_COMP_OFFSET;
+
+       do {
+               status = hv_do_hypercall(control, input, output);
+               if (!hv_result_success(status))
+                       return status;
+
+               rep_comp = hv_repcomp(status);
+
+               control &= ~HV_HYPERCALL_REP_START_MASK;
+               control |= (u64)rep_comp << HV_HYPERCALL_REP_START_OFFSET;
+
+               touch_nmi_watchdog();
+       } while (rep_comp < rep_count);
+
+       return status;
+}
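
An illustrative caller, mirroring the cold-discard hint issued by
hv_free_page_report() in the balloon driver above; the wrapper name and
errno mapping are hypothetical:

static int hv_hint_cold_discard(struct hv_memory_hint *hint, u16 nents)
{
	u64 status;

	/* 'nents' ranges follow the hint header; varhead_size is 0 here. */
	status = hv_do_rep_hypercall(HV_EXT_CALL_MEMORY_HEAT_HINT,
				     nents, 0, hint, NULL);

	return hv_result_success(status) ? 0 : -EINVAL;
}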
 
 /* Generate the guest OS identifier as described in the Hyper-V TLFS */
 static inline  __u64 generate_guest_id(__u64 d_info1, __u64 kernel_version,
@@ -56,7 +103,6 @@ static inline  __u64 generate_guest_id(__u64 d_info1, __u64 kernel_version,
        return guest_id;
 }
 
-
 /* Free the message slot and signal end-of-message if required */
 static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type)
 {
@@ -88,14 +134,14 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type)
                 * possibly deliver another msg from the
                 * hypervisor
                 */
-               hv_signal_eom();
+               hv_set_register(HV_REGISTER_EOM, 0);
        }
 }
 
-int hv_setup_vmbus_irq(int irq, void (*handler)(void));
-void hv_remove_vmbus_irq(void);
-void hv_enable_vmbus_irq(void);
-void hv_disable_vmbus_irq(void);
+void hv_setup_vmbus_handler(void (*handler)(void));
+void hv_remove_vmbus_handler(void);
+void hv_setup_stimer0_handler(void (*handler)(void));
+void hv_remove_stimer0_handler(void);
 
 void hv_setup_kexec_handler(void (*handler)(void));
 void hv_remove_kexec_handler(void);
@@ -103,6 +149,7 @@ void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs));
 void hv_remove_crash_handler(void);
 
 extern int vmbus_interrupt;
+extern int vmbus_irq;
 
 #if IS_ENABLED(CONFIG_HYPERV)
 /*
@@ -117,6 +164,10 @@ extern u32 hv_max_vp_index;
 /* Sentinel value for an uninitialized entry in hv_vp_index array */
 #define VP_INVAL       U32_MAX
 
+void *hv_alloc_hyperv_page(void);
+void *hv_alloc_hyperv_zeroed_page(void);
+void hv_free_hyperv_page(unsigned long addr);
+
 /**
  * hv_cpu_number_to_vp_number() - Map CPU to VP.
  * @cpu_number: CPU number in Linux terms
@@ -169,21 +220,16 @@ static inline int cpumask_to_vpset(struct hv_vpset *vpset,
 }
 
 void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die);
-void hyperv_report_panic_msg(phys_addr_t pa, size_t size);
 bool hv_is_hyperv_initialized(void);
 bool hv_is_hibernation_supported(void);
 enum hv_isolation_type hv_get_isolation_type(void);
 bool hv_is_isolation_supported(void);
 void hyperv_cleanup(void);
+bool hv_query_ext_cap(u64 cap_query);
 #else /* CONFIG_HYPERV */
 static inline bool hv_is_hyperv_initialized(void) { return false; }
 static inline bool hv_is_hibernation_supported(void) { return false; }
 static inline void hyperv_cleanup(void) {}
 #endif /* CONFIG_HYPERV */
 
-#if IS_ENABLED(CONFIG_HYPERV)
-extern int hv_setup_stimer0_irq(int *irq, int *vector, void (*handler)(void));
-extern void hv_remove_stimer0_irq(int irq);
-#endif
-
 #endif
index 34eef08..b6774aa 100644 (file)
@@ -21,8 +21,7 @@
 #define HV_MIN_DELTA_TICKS 1
 
 /* Routines called by the VMbus driver */
-extern int hv_stimer_alloc(void);
-extern void hv_stimer_free(void);
+extern int hv_stimer_alloc(bool have_percpu_irqs);
 extern int hv_stimer_cleanup(unsigned int cpu);
 extern void hv_stimer_legacy_init(unsigned int cpu, int sint);
 extern void hv_stimer_legacy_cleanup(unsigned int cpu);
index f1d74dc..9c2373a 100644 (file)
@@ -234,6 +234,7 @@ static inline u32 hv_get_avail_to_write_percent(
  * 5 . 0  (Newer Windows 10)
  * 5 . 1  (Windows 10 RS4)
  * 5 . 2  (Windows Server 2019, RS5)
+ * 5 . 3  (Windows Server 2022)
  */
 
 #define VERSION_WS2008  ((0 << 16) | (13))
@@ -245,6 +246,7 @@ static inline u32 hv_get_avail_to_write_percent(
 #define VERSION_WIN10_V5 ((5 << 16) | (0))
 #define VERSION_WIN10_V5_1 ((5 << 16) | (1))
 #define VERSION_WIN10_V5_2 ((5 << 16) | (2))
+#define VERSION_WIN10_V5_3 ((5 << 16) | (3))
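
Each VERSION_* value packs major.minor into a single u32, so 5.3 is
(5 << 16) | 3 == 0x00050003. Hypothetical decode helpers, for illustration:

static inline u16 vmbus_version_major(u32 ver) { return ver >> 16; }
static inline u16 vmbus_version_minor(u32 ver) { return ver & 0xFFFF; }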
 
 /* Make maximum size of pipe payload of 16K */
 #define MAX_PIPE_DATA_PAYLOAD          (sizeof(u8) * 16384)
@@ -284,7 +286,7 @@ struct vmbus_channel_offer {
 
                /*
                 * Pipes:
-                * The following sructure is an integrated pipe protocol, which
+                * The following structure is an integrated pipe protocol, which
                 * is implemented on top of standard user-defined data. Pipe
                 * clients have MAX_PIPE_USER_DEFINED_BYTES left for their own
                 * use.
@@ -475,6 +477,7 @@ enum vmbus_channel_message_type {
        CHANNELMSG_TL_CONNECT_REQUEST           = 21,
        CHANNELMSG_MODIFYCHANNEL                = 22,
        CHANNELMSG_TL_CONNECT_RESULT            = 23,
+       CHANNELMSG_MODIFYCHANNEL_RESPONSE       = 24,
        CHANNELMSG_COUNT
 };
 
@@ -588,6 +591,13 @@ struct vmbus_channel_open_result {
        u32 status;
 } __packed;
 
+/* Modify Channel Result parameters */
+struct vmbus_channel_modifychannel_response {
+       struct vmbus_channel_message_header header;
+       u32 child_relid;
+       u32 status;
+} __packed;
+
 /* Close channel parameters; */
 struct vmbus_channel_close_channel {
        struct vmbus_channel_message_header header;
@@ -720,6 +730,7 @@ struct vmbus_channel_msginfo {
                struct vmbus_channel_gpadl_torndown gpadl_torndown;
                struct vmbus_channel_gpadl_created gpadl_created;
                struct vmbus_channel_version_response version_response;
+               struct vmbus_channel_modifychannel_response modify_response;
        } response;
 
        u32 msgsize;
@@ -883,11 +894,11 @@ struct vmbus_channel {
         * Support for sub-channels. For high performance devices,
         * it will be useful to have multiple sub-channels to support
         * a scalable communication infrastructure with the host.
-        * The support for sub-channels is implemented as an extention
+        * The support for sub-channels is implemented as an extension
         * to the current infrastructure.
         * The initial offer is considered the primary channel and this
         * offer message will indicate if the host supports sub-channels.
-        * The guest is free to ask for sub-channels to be offerred and can
+        * The guest is free to ask for sub-channels to be offered and can
         * open these sub-channels as a normal "primary" channel. However,
         * all sub-channels will have the same type and instance guids as the
         * primary channel. Requests sent on a given channel will result in a
@@ -951,7 +962,7 @@ struct vmbus_channel {
         * Clearly, these optimizations improve throughput at the expense of
         * latency. Furthermore, since the channel is shared for both
         * control and data messages, control messages currently suffer
-        * unnecessary latency adversley impacting performance and boot
+        * unnecessary latency adversely impacting performance and boot
         * time. To fix this issue, permit tagging the channel as being
         * in "low latency" mode. In this mode, we will bypass the monitor
         * mechanism.
@@ -1594,7 +1605,7 @@ extern __u32 vmbus_proto_version;
 
 int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id,
                                  const guid_t *shv_host_servie_id);
-int vmbus_send_modifychannel(u32 child_relid, u32 target_vp);
+int vmbus_send_modifychannel(struct vmbus_channel *channel, u32 target_vp);
 void vmbus_set_event(struct vmbus_channel *channel);
 
 /* Get the start of the ring buffer. */