PCI: hv: Prepare hv_compose_msi_msg() for the VMBus-channel-interrupt-to-vCPU reassignment functionality
Author: Andrea Parri (Microsoft) <parri.andrea@gmail.com>
Mon, 6 Apr 2020 00:15:10 +0000 (02:15 +0200)
Committer: Wei Liu <wei.liu@kernel.org>
Thu, 23 Apr 2020 13:17:12 +0000 (13:17 +0000)
The current implementation of hv_compose_msi_msg() is incompatible with
the new functionality that allows changing the vCPU a VMBus channel will
interrupt: if this function always calls hv_pci_onchannelcallback() in
the polling loop, the interrupt going to a different CPU could cause
hv_pci_onchannelcallback() to be running simultaneously in a tasklet,
which will break.  The current code also has a problem in that it is not
synchronized with vmbus_reset_channel_cb(): hv_compose_msi_msg() could
be accessing the ring buffer via the call of hv_pci_onchannelcallback()
well after the time that vmbus_reset_channel_cb() has finished.

Fix these issues as follows.  Disable the channel tasklet before
entering the polling loop in hv_compose_msi_msg() and re-enable it when
done.  This will prevent hv_pci_onchannelcallback() from running in a
tasklet on a different CPU.  Moreover, poll by always calling
hv_pci_onchannelcallback(), but check the channel callback function for
NULL and invoke the callback within a sched_lock critical section.  This
will prevent hv_compose_msi_msg() from accessing the ring buffer after
vmbus_reset_channel_cb() has acquired the sched_lock spinlock.

Suggested-by: Michael Kelley <mikelley@microsoft.com>
Signed-off-by: Andrea Parri (Microsoft) <parri.andrea@gmail.com>
Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: Andrew Murray <amurray@thegoodpenguin.co.uk>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: <linux-pci@vger.kernel.org>
Link: https://lore.kernel.org/r/20200406001514.19876-8-parri.andrea@gmail.com
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
drivers/pci/controller/pci-hyperv.c

index e15022ff63e30bb23f4be74ff6da927d241a53c2..222ff5639ebe866f431373b43e53604feed63df5 100644 (file)
@@ -1356,11 +1356,11 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
 {
        struct irq_cfg *cfg = irqd_cfg(data);
        struct hv_pcibus_device *hbus;
+       struct vmbus_channel *channel;
        struct hv_pci_dev *hpdev;
        struct pci_bus *pbus;
        struct pci_dev *pdev;
        struct cpumask *dest;
-       unsigned long flags;
        struct compose_comp_ctxt comp;
        struct tran_int_desc *int_desc;
        struct {
@@ -1378,6 +1378,7 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
        dest = irq_data_get_effective_affinity_mask(data);
        pbus = pdev->bus;
        hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
+       channel = hbus->hdev->channel;
        hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
        if (!hpdev)
                goto return_null_message;
@@ -1435,43 +1436,52 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
                goto free_int_desc;
        }
 
+       /*
+        * Prevents hv_pci_onchannelcallback() from running concurrently
+        * in the tasklet.
+        */
+       tasklet_disable(&channel->callback_event);
+
        /*
         * Since this function is called with IRQ locks held, can't
         * do normal wait for completion; instead poll.
         */
        while (!try_wait_for_completion(&comp.comp_pkt.host_event)) {
+               unsigned long flags;
+
                /* 0xFFFF means an invalid PCI VENDOR ID. */
                if (hv_pcifront_get_vendor_id(hpdev) == 0xFFFF) {
                        dev_err_once(&hbus->hdev->device,
                                     "the device has gone\n");
-                       goto free_int_desc;
+                       goto enable_tasklet;
                }
 
                /*
-                * When the higher level interrupt code calls us with
-                * interrupt disabled, we must poll the channel by calling
-                * the channel callback directly when channel->target_cpu is
-                * the current CPU. When the higher level interrupt code
-                * calls us with interrupt enabled, let's add the
-                * local_irq_save()/restore() to avoid race:
-                * hv_pci_onchannelcallback() can also run in tasklet.
+                * Make sure that the ring buffer data structure doesn't get
+                * freed while we dereference the ring buffer pointer.  Test
+                * for the channel's onchannel_callback being NULL within a
+                * sched_lock critical section.  See also the inline comments
+                * in vmbus_reset_channel_cb().
                 */
-               local_irq_save(flags);
-
-               if (hbus->hdev->channel->target_cpu == smp_processor_id())
-                       hv_pci_onchannelcallback(hbus);
-
-               local_irq_restore(flags);
+               spin_lock_irqsave(&channel->sched_lock, flags);
+               if (unlikely(channel->onchannel_callback == NULL)) {
+                       spin_unlock_irqrestore(&channel->sched_lock, flags);
+                       goto enable_tasklet;
+               }
+               hv_pci_onchannelcallback(hbus);
+               spin_unlock_irqrestore(&channel->sched_lock, flags);
 
                if (hpdev->state == hv_pcichild_ejecting) {
                        dev_err_once(&hbus->hdev->device,
                                     "the device is being ejected\n");
-                       goto free_int_desc;
+                       goto enable_tasklet;
                }
 
                udelay(100);
        }
 
+       tasklet_enable(&channel->callback_event);
+
        if (comp.comp_pkt.completion_status < 0) {
                dev_err(&hbus->hdev->device,
                        "Request for interrupt failed: 0x%x",
@@ -1495,6 +1505,8 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
        put_pcichild(hpdev);
        return;
 
+enable_tasklet:
+       tasklet_enable(&channel->callback_event);
 free_int_desc:
        kfree(int_desc);
 drop_reference: