x86/apic/msi: Plug non-maskable MSI affinity race

author Thomas Gleixner <tglx@linutronix.de>

Fri, 31 Jan 2020 14:26:52 +0000 (15:26 +0100)

committer Thomas Gleixner <tglx@linutronix.de>

Sat, 1 Feb 2020 08:31:47 +0000 (09:31 +0100)
author Thomas Gleixner <tglx@linutronix.de>
Fri, 31 Jan 2020 14:26:52 +0000 (15:26 +0100)
committer Thomas Gleixner <tglx@linutronix.de>
Sat, 1 Feb 2020 08:31:47 +0000 (09:31 +0100)
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h

index be0b9cf..19e94af 100644 (file)
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -454,6 +454,14 @@ static inline void ack_APIC_irq(void)
         apic_eoi();
  }
  
+
+static inline bool lapic_vector_set_in_irr(unsigned int vector)
+{
+       u32 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+
+       return !!(irr & (1U << (vector % 32)));
+}
+
  static inline unsigned default_get_apic_id(unsigned long x)
  {
         unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c

index 7f75334..159bd0c 100644 (file)
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -23,10 +23,8 @@
  
  static struct irq_domain *msi_default_domain;
  
-static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
+static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg)
  {
-       struct irq_cfg *cfg = irqd_cfg(data);
-
         msg->address_hi = MSI_ADDR_BASE_HI;
  
         if (x2apic_enabled())
@@ -47,6 +45,127 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
                 MSI_DATA_VECTOR(cfg->vector);
  }
  
+static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
+{
+       __irq_msi_compose_msg(irqd_cfg(data), msg);
+}
+
+static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg)
+{
+       struct msi_msg msg[2] = { [1] = { }, };
+
+       __irq_msi_compose_msg(cfg, msg);
+       irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg);
+}
+
+static int
+msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force)
+{
+       struct irq_cfg old_cfg, *cfg = irqd_cfg(irqd);
+       struct irq_data *parent = irqd->parent_data;
+       unsigned int cpu;
+       int ret;
+
+       /* Save the current configuration */
+       cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd));
+       old_cfg = *cfg;
+
+       /* Allocate a new target vector */
+       ret = parent->chip->irq_set_affinity(parent, mask, force);
+       if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
+               return ret;
+
+       /*
+        * For non-maskable and non-remapped MSI interrupts the migration
+        * to a different destination CPU and a different vector has to be
+        * done careful to handle the possible stray interrupt which can be
+        * caused by the non-atomic update of the address/data pair.
+        *
+        * Direct update is possible when:
+        * - The MSI is maskable (remapped MSI does not use this code path)).
+        *   The quirk bit is not set in this case.
+        * - The new vector is the same as the old vector
+        * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up)
+        * - The new destination CPU is the same as the old destination CPU
+        */
+       if (!irqd_msi_nomask_quirk(irqd) ||
+           cfg->vector == old_cfg.vector ||
+           old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR ||
+           cfg->dest_apicid == old_cfg.dest_apicid) {
+               irq_msi_update_msg(irqd, cfg);
+               return ret;
+       }
+
+       /*
+        * Paranoia: Validate that the interrupt target is the local
+        * CPU.
+        */
+       if (WARN_ON_ONCE(cpu != smp_processor_id())) {
+               irq_msi_update_msg(irqd, cfg);
+               return ret;
+       }
+
+       /*
+        * Redirect the interrupt to the new vector on the current CPU
+        * first. This might cause a spurious interrupt on this vector if
+        * the device raises an interrupt right between this update and the
+        * update to the final destination CPU.
+        *
+        * If the vector is in use then the installed device handler will
+        * denote it as spurious which is no harm as this is a rare event
+        * and interrupt handlers have to cope with spurious interrupts
+        * anyway. If the vector is unused, then it is marked so it won't
+        * trigger the 'No irq handler for vector' warning in do_IRQ().
+        *
+        * This requires to hold vector lock to prevent concurrent updates to
+        * the affected vector.
+        */
+       lock_vector_lock();
+
+       /*
+        * Mark the new target vector on the local CPU if it is currently
+        * unused. Reuse the VECTOR_RETRIGGERED state which is also used in
+        * the CPU hotplug path for a similar purpose. This cannot be
+        * undone here as the current CPU has interrupts disabled and
+        * cannot handle the interrupt before the whole set_affinity()
+        * section is done. In the CPU unplug case, the current CPU is
+        * about to vanish and will not handle any interrupts anymore. The
+        * vector is cleaned up when the CPU comes online again.
+        */
+       if (IS_ERR_OR_NULL(this_cpu_read(vector_irq[cfg->vector])))
+               this_cpu_write(vector_irq[cfg->vector], VECTOR_RETRIGGERED);
+
+       /* Redirect it to the new vector on the local CPU temporarily */
+       old_cfg.vector = cfg->vector;
+       irq_msi_update_msg(irqd, &old_cfg);
+
+       /* Now transition it to the target CPU */
+       irq_msi_update_msg(irqd, cfg);
+
+       /*
+        * All interrupts after this point are now targeted at the new
+        * vector/CPU.
+        *
+        * Drop vector lock before testing whether the temporary assignment
+        * to the local CPU was hit by an interrupt raised in the device,
+        * because the retrigger function acquires vector lock again.
+        */
+       unlock_vector_lock();
+
+       /*
+        * Check whether the transition raced with a device interrupt and
+        * is pending in the local APICs IRR. It is safe to do this outside
+        * of vector lock as the irq_desc::lock of this interrupt is still
+        * held and interrupts are disabled: The check is not accessing the
+        * underlying vector store. It's just checking the local APIC's
+        * IRR.
+        */
+       if (lapic_vector_set_in_irr(cfg->vector))
+               irq_data_get_irq_chip(irqd)->irq_retrigger(irqd);
+
+       return ret;
+}
+
  /*
   * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
   * which implement the MSI or MSI-X Capability Structure.
@@ -58,6 +177,7 @@ static struct irq_chip pci_msi_controller = {
         .irq_ack                = irq_chip_ack_parent,
         .irq_retrigger          = irq_chip_retrigger_hierarchy,
         .irq_compose_msi_msg    = irq_msi_compose_msg,
+       .irq_set_affinity       = msi_set_affinity,
         .flags                  = IRQCHIP_SKIP_SET_WAKE,
  };
  
@@ -146,6 +266,8 @@ void __init arch_init_msi_domain(struct irq_domain *parent)
         }
         if (!msi_default_domain)
                 pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n");
+       else
+               msi_default_domain->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK;
  }
  
  #ifdef CONFIG_IRQ_REMAP
diff --git a/include/linux/irq.h b/include/linux/irq.h

index 7853eb9..3ed5a05 100644 (file)
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -209,6 +209,8 @@ struct irq_data {
   * IRQD_SINGLE_TARGET          - IRQ allows only a single affinity target
   * IRQD_DEFAULT_TRIGGER_SET    - Expected trigger already been set
   * IRQD_CAN_RESERVE            - Can use reservation mode
+ * IRQD_MSI_NOMASK_QUIRK       - Non-maskable MSI quirk for affinity change
+ *                               required
   */
  enum {
         IRQD_TRIGGER_MASK               = 0xf,
@@ -231,6 +233,7 @@ enum {
         IRQD_SINGLE_TARGET              = (1 << 24),
         IRQD_DEFAULT_TRIGGER_SET        = (1 << 25),
         IRQD_CAN_RESERVE                = (1 << 26),
+       IRQD_MSI_NOMASK_QUIRK           = (1 << 27),
  };
  
  #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
@@ -390,6 +393,21 @@ static inline bool irqd_can_reserve(struct irq_data *d)
         return __irqd_to_state(d) & IRQD_CAN_RESERVE;
  }
  
+static inline void irqd_set_msi_nomask_quirk(struct irq_data *d)
+{
+       __irqd_to_state(d) |= IRQD_MSI_NOMASK_QUIRK;
+}
+
+static inline void irqd_clr_msi_nomask_quirk(struct irq_data *d)
+{
+       __irqd_to_state(d) &= ~IRQD_MSI_NOMASK_QUIRK;
+}
+
+static inline bool irqd_msi_nomask_quirk(struct irq_data *d)
+{
+       return __irqd_to_state(d) & IRQD_MSI_NOMASK_QUIRK;
+}
+
  #undef __irqd_to_state
  
  static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h

index 3c340db..4da8df5 100644 (file)
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -207,6 +207,13 @@ enum {
         IRQ_DOMAIN_FLAG_MSI_REMAP       = (1 << 5),
  
         /*
+        * Quirk to handle MSI implementations which do not provide
+        * masking. Currently known to affect x86, but partially
+        * handled in core code.
+        */
+       IRQ_DOMAIN_MSI_NOMASK_QUIRK     = (1 << 6),
+
+       /*
          * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved
          * for implementation specific purposes and ignored by the
          * core code.
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c

index c1eccd4..a949bd3 100644 (file)
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -114,6 +114,7 @@ static const struct irq_bit_descr irqdata_states[] = {
         BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
         BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
         BIT_MASK_DESCR(IRQD_CAN_RESERVE),
+       BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK),
  
         BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
  
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c

index ad26fbc..eb95f61 100644 (file)
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -453,8 +453,11 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
                         continue;
  
                 irq_data = irq_domain_get_irq_data(domain, desc->irq);
-               if (!can_reserve)
+               if (!can_reserve) {
                         irqd_clr_can_reserve(irq_data);
+                       if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
+                               irqd_set_msi_nomask_quirk(irq_data);
+               }
                 ret = irq_domain_activate_irq(irq_data, can_reserve);
                 if (ret)
                         goto cleanup;
author	Thomas Gleixner <tglx@linutronix.de>
	Fri, 31 Jan 2020 14:26:52 +0000 (15:26 +0100)
committer	Thomas Gleixner <tglx@linutronix.de>
	Sat, 1 Feb 2020 08:31:47 +0000 (09:31 +0100)
arch/x86/include/asm/apic.h		patch \| blob \| history
arch/x86/kernel/apic/msi.c		patch \| blob \| history
include/linux/irq.h		patch \| blob \| history
include/linux/irqdomain.h		patch \| blob \| history
kernel/irq/debugfs.c		patch \| blob \| history
kernel/irq/msi.c		patch \| blob \| history