mtip32xx: Add workqueue and NUMA support
author    Asai Thambi S P <asamymuthupa@micron.com>
Thu, 20 Dec 2012 15:46:25 +0000 (07:46 -0800)
committer Jens Axboe <axboe@kernel.dk>
Fri, 11 Jan 2013 13:38:57 +0000 (14:38 +0100)
This patch contains:
* parallel command completion using workers
* binding of the workers to the chosen NUMA node
* binding of the ISR to the chosen NUMA node
* memory allocation on the chosen NUMA node
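
The allocation changes all switch to the node-aware variants of the usual
APIs. A condensed sketch of the pattern, using the names from the diff below:

    /* allocate driver data, disk, queue and service thread on the node */
    dd    = kzalloc_node(sizeof(*dd), GFP_KERNEL, my_node);
    disk  = alloc_disk_node(MTIP_MAX_MINORS, dd->numa_node);
    queue = blk_alloc_queue_node(GFP_KERNEL, dd->numa_node);
    thd   = kthread_create_on_node(mtip_service_thread, dd,
                                   dd->numa_node, thd_name);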

Signed-off-by: Asai Thambi S P <asamymuthupa@micron.com>
Signed-off-by: Sam Bradshaw <sbradshaw@micron.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
drivers/block/mtip32xx/mtip32xx.c
drivers/block/mtip32xx/mtip32xx.h

index 9694dd9..99c2cf4 100644
@@ -88,6 +88,8 @@ static int instance;
 static int mtip_major;
 static struct dentry *dfs_parent;
 
+static u32 cpu_use[NR_CPUS];
+
 static DEFINE_SPINLOCK(rssd_index_lock);
 static DEFINE_IDA(rssd_index_ida);
 
@@ -296,16 +298,17 @@ static int hba_reset_nosleep(struct driver_data *dd)
  */
 static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag)
 {
-       atomic_set(&port->commands[tag].active, 1);
+       int group = tag >> 5;
 
-       spin_lock(&port->cmd_issue_lock);
+       atomic_set(&port->commands[tag].active, 1);
 
+       /* guard SACT and CI registers */
+       spin_lock(&port->cmd_issue_lock[group]);
        writel((1 << MTIP_TAG_BIT(tag)),
                        port->s_active[MTIP_TAG_INDEX(tag)]);
        writel((1 << MTIP_TAG_BIT(tag)),
                        port->cmd_issue[MTIP_TAG_INDEX(tag)]);
-
-       spin_unlock(&port->cmd_issue_lock);
+       spin_unlock(&port->cmd_issue_lock[group]);
 
        /* Set the command's timeout value.*/
        port->commands[tag].comp_time = jiffies + msecs_to_jiffies(
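
Each slot group covers 32 command slots, so a tag maps to its group (and now
to its per-group lock) with plain shift/mask arithmetic, the same math the
MTIP_TAG_INDEX()/MTIP_TAG_BIT() macros perform. A standalone illustration:

    #include <stdio.h>

    int main(void)
    {
            int tag = 37;
            int group = tag >> 5;   /* tag / 32 -> slot group 1 */
            int bit   = tag & 0x1f; /* tag % 32 -> bit 5 in that group */

            printf("tag %d -> group %d, bit %d\n", tag, group, bit);
            return 0;
    }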
@@ -963,56 +966,56 @@ handle_tfe_exit:
 /*
  * Handle a set device bits interrupt
  */
-static inline void mtip_process_sdbf(struct driver_data *dd)
+static inline void mtip_workq_sdbfx(struct mtip_port *port, int group,
+                                                       u32 completed)
 {
-       struct mtip_port  *port = dd->port;
-       int group, tag, bit;
-       u32 completed;
+       struct driver_data *dd = port->dd;
+       int tag, bit;
        struct mtip_cmd *command;
 
-       /* walk all bits in all slot groups */
-       for (group = 0; group < dd->slot_groups; group++) {
-               completed = readl(port->completed[group]);
-               if (!completed)
-                       continue;
+       /* should never be called with nothing to complete */
+       if (WARN_ON_ONCE(!completed))
+               return;
+       /* Clear the completed status register in the hardware. */
+       writel(completed, port->completed[group]);
 
-               /* clear completed status register in the hardware.*/
-               writel(completed, port->completed[group]);
+       /* Process completed commands. */
+       for (bit = 0; (bit < 32) && completed; bit++) {
+               if (completed & 0x01) {
+                       tag = (group << 5) | bit;
 
-               /* Process completed commands. */
-               for (bit = 0;
-                    (bit < 32) && completed;
-                    bit++, completed >>= 1) {
-                       if (completed & 0x01) {
-                               tag = (group << 5) | bit;
+                       /* skip internal command slot. */
+                       if (unlikely(tag == MTIP_TAG_INTERNAL))
+                               continue;
 
-                               /* skip internal command slot. */
-                               if (unlikely(tag == MTIP_TAG_INTERNAL))
-                                       continue;
+                       command = &port->commands[tag];
+                       /* make internal callback */
+                       if (likely(command->comp_func)) {
+                               command->comp_func(
+                                       port,
+                                       tag,
+                                       command->comp_data,
+                                       0);
+                       } else {
+                               dev_warn(&dd->pdev->dev,
+                                       "Null completion for tag %d",
+                                       tag);
 
-                               command = &port->commands[tag];
-                               /* make internal callback */
-                               if (likely(command->comp_func)) {
-                                       command->comp_func(
-                                               port,
-                                               tag,
-                                               command->comp_data,
-                                               0);
-                               } else {
-                                       dev_warn(&dd->pdev->dev,
-                                               "Null completion "
-                                               "for tag %d",
-                                               tag);
-
-                                       if (mtip_check_surprise_removal(
-                                               dd->pdev)) {
-                                               mtip_command_cleanup(dd);
-                                               return;
-                                       }
+                               if (mtip_check_surprise_removal(
+                                       dd->pdev)) {
+                                       mtip_command_cleanup(dd);
+                                       return;
                                }
                        }
                }
+               completed >>= 1;
        }
+
+       /* If this is the last active worker, re-enable interrupts */
+       if (atomic_dec_return(&dd->irq_workers_active) == 0)
+               writel(0xffffffff, dd->mmio + HOST_IRQ_STAT);
 }
 
 /*
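
The SDB interrupt stays masked while the completion workers run; whichever
context finishes last re-arms it. The hand-off, condensed from the code above:

    /* ISR: account for every slot group that has completed bits */
    atomic_set(&dd->irq_workers_active, workers);

    /* each worker (and the inline group-0 path): last one out re-arms */
    if (atomic_dec_return(&dd->irq_workers_active) == 0)
            writel(0xffffffff, dd->mmio + HOST_IRQ_STAT);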
@@ -1071,6 +1074,8 @@ static inline irqreturn_t mtip_handle_irq(struct driver_data *data)
        struct mtip_port *port = dd->port;
        u32 hba_stat, port_stat;
        int rv = IRQ_NONE;
+       int do_irq_enable = 1, i, workers;
+       struct mtip_work *twork;
 
        hba_stat = readl(dd->mmio + HOST_IRQ_STAT);
        if (hba_stat) {
@@ -1081,8 +1086,42 @@ static inline irqreturn_t mtip_handle_irq(struct driver_data *data)
                writel(port_stat, port->mmio + PORT_IRQ_STAT);
 
                /* Demux port status */
-               if (likely(port_stat & PORT_IRQ_SDB_FIS))
-                       mtip_process_sdbf(dd);
+               if (likely(port_stat & PORT_IRQ_SDB_FIS)) {
+                       do_irq_enable = 0;
+                       WARN_ON_ONCE(atomic_read(&dd->irq_workers_active) != 0);
+
+                       /*
+                        * Snapshot the completed bits for every slot
+                        * group; group 0 is handled inline below.
+                        */
+                       for (i = 0, workers = 0; i < MTIP_MAX_SLOT_GROUPS;
+                                                                       i++) {
+                               twork = &dd->work[i];
+                               twork->completed = readl(port->completed[i]);
+                               if (twork->completed)
+                                       workers++;
+                       }
+
+                       atomic_set(&dd->irq_workers_active, workers);
+                       if (workers) {
+                               for (i = 1; i < MTIP_MAX_SLOT_GROUPS; i++) {
+                                       twork = &dd->work[i];
+                                       if (twork->completed)
+                                               queue_work_on(
+                                                       twork->cpu_binding,
+                                                       dd->isr_workq,
+                                                       &twork->work);
+                               }
+
+                               if (likely(dd->work[0].completed))
+                                       mtip_workq_sdbfx(port, 0,
+                                                       dd->work[0].completed);
+
+                       } else {
+                               /*
+                                * Chip quirk: SDB interrupt but nothing
+                                * to complete
+                                */
+                               do_irq_enable = 1;
+                       }
+               }
 
                if (unlikely(port_stat & PORT_IRQ_ERR)) {
                        if (unlikely(mtip_check_surprise_removal(dd->pdev))) {
@@ -1102,21 +1141,13 @@ static inline irqreturn_t mtip_handle_irq(struct driver_data *data)
        }
 
        /* acknowledge interrupt */
-       writel(hba_stat, dd->mmio + HOST_IRQ_STAT);
+       if (unlikely(do_irq_enable))
+               writel(hba_stat, dd->mmio + HOST_IRQ_STAT);
 
        return rv;
 }
 
 /*
- * Wrapper for mtip_handle_irq
- * (ignores return code)
- */
-static void mtip_tasklet(unsigned long data)
-{
-       mtip_handle_irq((struct driver_data *) data);
-}
-
-/*
  * HBA interrupt subroutine.
  *
  * @irq                IRQ number.
@@ -1129,8 +1160,8 @@ static void mtip_tasklet(unsigned long data)
 static irqreturn_t mtip_irq_handler(int irq, void *instance)
 {
        struct driver_data *dd = instance;
-       tasklet_schedule(&dd->tasklet);
-       return IRQ_HANDLED;
+
+       return mtip_handle_irq(dd);
 }
 
 static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag)
@@ -3004,20 +3035,24 @@ static int mtip_hw_init(struct driver_data *dd)
 
        hba_setup(dd);
 
-       tasklet_init(&dd->tasklet, mtip_tasklet, (unsigned long)dd);
-
-       dd->port = kzalloc(sizeof(struct mtip_port), GFP_KERNEL);
+       dd->port = kzalloc_node(sizeof(struct mtip_port), GFP_KERNEL,
+                               dd->numa_node);
        if (!dd->port) {
                dev_err(&dd->pdev->dev,
                        "Memory allocation: port structure\n");
                return -ENOMEM;
        }
 
+       /* Continue workqueue setup */
+       for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++)
+               dd->work[i].port = dd->port;
+
        /* Counting semaphore to track command slot usage */
        sema_init(&dd->port->cmd_slot, num_command_slots - 1);
 
        /* Spinlock to prevent concurrent issue */
-       spin_lock_init(&dd->port->cmd_issue_lock);
+       for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++)
+               spin_lock_init(&dd->port->cmd_issue_lock[i]);
 
        /* Set the port mmio base address. */
        dd->port->mmio  = dd->mmio + PORT_OFFSET;
@@ -3164,6 +3199,7 @@ static int mtip_hw_init(struct driver_data *dd)
                        "Unable to allocate IRQ %d\n", dd->pdev->irq);
                goto out2;
        }
+       irq_set_affinity_hint(dd->pdev->irq, get_cpu_mask(dd->isr_binding));
 
        /* Enable interrupts on the HBA. */
        writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN,
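
irq_set_affinity_hint() only publishes a hint for user space (e.g.
irqbalance) to act on, and the hint should be cleared before the IRQ is
released, which is why every teardown path in this patch pairs it with NULL.
The pairing, as a sketch:

    /* setup: after the IRQ has been requested */
    irq_set_affinity_hint(dd->pdev->irq, get_cpu_mask(dd->isr_binding));

    /* teardown: drop the hint, then release the IRQ */
    irq_set_affinity_hint(dd->pdev->irq, NULL);
    devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);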
@@ -3240,7 +3276,8 @@ out3:
        writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN,
                        dd->mmio + HOST_CTL);
 
-       /*Release the IRQ. */
+       /* Release the IRQ. */
+       irq_set_affinity_hint(dd->pdev->irq, NULL);
        devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
 
 out2:
@@ -3290,11 +3327,9 @@ static int mtip_hw_exit(struct driver_data *dd)
        del_timer_sync(&dd->port->cmd_timer);
 
        /* Release the IRQ. */
+       irq_set_affinity_hint(dd->pdev->irq, NULL);
        devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
 
-       /* Stop the bottom half tasklet. */
-       tasklet_kill(&dd->tasklet);
-
        /* Free the command/command header memory. */
        dmam_free_coherent(&dd->pdev->dev,
                        HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4),
@@ -3710,7 +3745,7 @@ static int mtip_block_initialize(struct driver_data *dd)
                goto protocol_init_error;
        }
 
-       dd->disk = alloc_disk(MTIP_MAX_MINORS);
+       dd->disk = alloc_disk_node(MTIP_MAX_MINORS, dd->numa_node);
        if (dd->disk  == NULL) {
                dev_err(&dd->pdev->dev,
                        "Unable to allocate gendisk structure\n");
@@ -3754,7 +3789,7 @@ static int mtip_block_initialize(struct driver_data *dd)
 
 skip_create_disk:
        /* Allocate the request queue. */
-       dd->queue = blk_alloc_queue(GFP_KERNEL);
+       dd->queue = blk_alloc_queue_node(GFP_KERNEL, dd->numa_node);
        if (dd->queue == NULL) {
                dev_err(&dd->pdev->dev,
                        "Unable to allocate request queue\n");
@@ -3812,9 +3847,8 @@ skip_create_disk:
 
 start_service_thread:
        sprintf(thd_name, "mtip_svc_thd_%02d", index);
-
-       dd->mtip_svc_handler = kthread_run(mtip_service_thread,
-                                               dd, thd_name);
+       dd->mtip_svc_handler = kthread_create_on_node(mtip_service_thread,
+                                               dd, dd->numa_node, thd_name);
 
        if (IS_ERR(dd->mtip_svc_handler)) {
                dev_err(&dd->pdev->dev, "service thread failed to start\n");
@@ -3822,7 +3856,7 @@ start_service_thread:
                rv = -EFAULT;
                goto kthread_run_error;
        }
-
+       wake_up_process(dd->mtip_svc_handler);
        if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC)
                rv = wait_for_rebuild;
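
kthread_run() is simply kthread_create() followed by wake_up_process();
splitting the two lets the driver substitute the node-aware create while
keeping the same start-up semantics. Roughly:

    /* node-aware equivalent of kthread_run() (sketch) */
    tsk = kthread_create_on_node(mtip_service_thread, dd, dd->numa_node,
                                 "mtip_svc_thd_%02d", index);
    if (!IS_ERR(tsk))
            wake_up_process(tsk);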
 
@@ -3951,6 +3985,56 @@ static int mtip_block_resume(struct driver_data *dd)
        return 0;
 }
 
+static void drop_cpu(int cpu)
+{
+       cpu_use[cpu]--;
+}
+
+static int get_least_used_cpu_on_node(int node)
+{
+       int cpu, least_used_cpu, least_cnt;
+       const struct cpumask *node_mask;
+
+       node_mask = cpumask_of_node(node);
+       least_used_cpu = cpumask_first(node_mask);
+       least_cnt = cpu_use[least_used_cpu];
+       cpu = least_used_cpu;
+
+       for_each_cpu(cpu, node_mask) {
+               if (cpu_use[cpu] < least_cnt) {
+                       least_used_cpu = cpu;
+                       least_cnt = cpu_use[cpu];
+               }
+       }
+       cpu_use[least_used_cpu]++;
+       return least_used_cpu;
+}
+
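
cpu_use[] is plain per-CPU reference counting: every binding taken with
get_least_used_cpu_on_node() must eventually be returned with drop_cpu(),
which is what the probe error path and mtip_pci_remove() below do for the
three distinct CPUs they acquire. The pairing, condensed:

    /* acquire: pick the least-loaded CPU on the node, cpu_use[cpu]++ */
    binding = get_least_used_cpu_on_node(dd->numa_node);

    /* release: mirror every acquisition, cpu_use[cpu]-- */
    drop_cpu(binding);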
+/* Helper for selecting a node in round-robin mode */
+static inline int mtip_get_next_rr_node(void)
+{
+       static int next_node = -1;
+
+       if (next_node == -1) {
+               next_node = first_online_node;
+               return next_node;
+       }
+
+       next_node = next_online_node(next_node);
+       if (next_node == MAX_NUMNODES)
+               next_node = first_online_node;
+       return next_node;
+}
+
+DEFINE_HANDLER(0);
+DEFINE_HANDLER(1);
+DEFINE_HANDLER(2);
+DEFINE_HANDLER(3);
+DEFINE_HANDLER(4);
+DEFINE_HANDLER(5);
+DEFINE_HANDLER(6);
+DEFINE_HANDLER(7);
+
 /*
  * Called for each supported PCI device detected.
  *
@@ -3965,9 +4049,25 @@ static int mtip_pci_probe(struct pci_dev *pdev,
 {
        int rv = 0;
        struct driver_data *dd = NULL;
+       char cpu_list[256];
+       const struct cpumask *node_mask;
+       int cpu, i = 0, j = 0;
+       int my_node = NUMA_NO_NODE;
 
        /* Allocate memory for this device's private data. */
-       dd = kzalloc(sizeof(struct driver_data), GFP_KERNEL);
+       my_node = pcibus_to_node(pdev->bus);
+       if (my_node != NUMA_NO_NODE) {
+               if (!node_online(my_node))
+                       my_node = mtip_get_next_rr_node();
+       } else {
+               dev_info(&pdev->dev, "Kernel not reporting proximity, choosing a node\n");
+               my_node = mtip_get_next_rr_node();
+       }
+       dev_info(&pdev->dev, "NUMA node %d (closest: %d,%d, probe on %d:%d)\n",
+               my_node, pcibus_to_node(pdev->bus), dev_to_node(&pdev->dev),
+               cpu_to_node(smp_processor_id()), smp_processor_id());
+
+       dd = kzalloc_node(sizeof(struct driver_data), GFP_KERNEL, my_node);
        if (dd == NULL) {
                dev_err(&pdev->dev,
                        "Unable to allocate memory for driver data\n");
@@ -4004,19 +4104,82 @@ static int mtip_pci_probe(struct pci_dev *pdev,
                }
        }
 
-       pci_set_master(pdev);
+       /* Copy the info we may need later into the private data structure. */
+       dd->major       = mtip_major;
+       dd->instance    = instance;
+       dd->pdev        = pdev;
+       dd->numa_node   = my_node;
+
+       snprintf(dd->workq_name, sizeof(dd->workq_name), "mtipq%d",
+                dd->instance);
 
+       dd->isr_workq = create_workqueue(dd->workq_name);
+       if (!dd->isr_workq) {
+               dev_warn(&pdev->dev, "Can't create wq %d\n", dd->instance);
+               goto block_initialize_err;
+       }
+
+       memset(cpu_list, 0, sizeof(cpu_list));
+
+       node_mask = cpumask_of_node(dd->numa_node);
+       if (!cpumask_empty(node_mask)) {
+               for_each_cpu(cpu, node_mask) {
+                       snprintf(&cpu_list[j], 256 - j, "%d ", cpu);
+                       j = strlen(cpu_list);
+               }
+
+               dev_info(&pdev->dev, "Node %d on package %d has %d cpu(s): %s\n",
+                       dd->numa_node,
+                       topology_physical_package_id(cpumask_first(node_mask)),
+                       nr_cpus_node(dd->numa_node),
+                       cpu_list);
+       } else
+               dev_dbg(&pdev->dev, "mtip32xx: node_mask empty\n");
+
+       dd->isr_binding = get_least_used_cpu_on_node(dd->numa_node);
+       dev_info(&pdev->dev, "Initial IRQ binding node:cpu %d:%d\n",
+               cpu_to_node(dd->isr_binding), dd->isr_binding);
+
+       /* first worker context always runs in ISR */
+       dd->work[0].cpu_binding = dd->isr_binding;
+       dd->work[1].cpu_binding = get_least_used_cpu_on_node(dd->numa_node);
+       dd->work[2].cpu_binding = get_least_used_cpu_on_node(dd->numa_node);
+       dd->work[3].cpu_binding = dd->work[0].cpu_binding;
+       dd->work[4].cpu_binding = dd->work[1].cpu_binding;
+       dd->work[5].cpu_binding = dd->work[2].cpu_binding;
+       dd->work[6].cpu_binding = dd->work[2].cpu_binding;
+       dd->work[7].cpu_binding = dd->work[1].cpu_binding;
+
+       /* Log the bindings */
+       for_each_present_cpu(cpu) {
+               memset(cpu_list, 0, sizeof(cpu_list));
+               for (i = 0, j = 0; i < MTIP_MAX_SLOT_GROUPS; i++) {
+                       if (dd->work[i].cpu_binding == cpu) {
+                               snprintf(&cpu_list[j], 256 - j, "%d ", i);
+                               j = strlen(cpu_list);
+                       }
+               }
+               if (j)
+                       dev_info(&pdev->dev, "CPU %d: WQs %s\n", cpu, cpu_list);
+       }
+
+       INIT_WORK(&dd->work[0].work, mtip_workq_sdbf0);
+       INIT_WORK(&dd->work[1].work, mtip_workq_sdbf1);
+       INIT_WORK(&dd->work[2].work, mtip_workq_sdbf2);
+       INIT_WORK(&dd->work[3].work, mtip_workq_sdbf3);
+       INIT_WORK(&dd->work[4].work, mtip_workq_sdbf4);
+       INIT_WORK(&dd->work[5].work, mtip_workq_sdbf5);
+       INIT_WORK(&dd->work[6].work, mtip_workq_sdbf6);
+       INIT_WORK(&dd->work[7].work, mtip_workq_sdbf7);
+
+       pci_set_master(pdev);
        if (pci_enable_msi(pdev)) {
                dev_warn(&pdev->dev,
                        "Unable to enable MSI interrupt.\n");
                goto block_initialize_err;
        }
 
-       /* Copy the info we may need later into the private data structure. */
-       dd->major       = mtip_major;
-       dd->instance    = instance;
-       dd->pdev        = pdev;
-
        /* Initialize the block layer. */
        rv = mtip_block_initialize(dd);
        if (rv < 0) {
@@ -4036,7 +4199,13 @@ static int mtip_pci_probe(struct pci_dev *pdev,
 
 block_initialize_err:
        pci_disable_msi(pdev);
-
+       if (dd->isr_workq) {
+               flush_workqueue(dd->isr_workq);
+               destroy_workqueue(dd->isr_workq);
+               drop_cpu(dd->work[0].cpu_binding);
+               drop_cpu(dd->work[1].cpu_binding);
+               drop_cpu(dd->work[2].cpu_binding);
+       }
 setmask_err:
        pcim_iounmap_regions(pdev, 1 << MTIP_ABAR);
 
@@ -4077,6 +4246,14 @@ static void mtip_pci_remove(struct pci_dev *pdev)
        /* Clean up the block layer. */
        mtip_block_remove(dd);
 
+       if (dd->isr_workq) {
+               flush_workqueue(dd->isr_workq);
+               destroy_workqueue(dd->isr_workq);
+               drop_cpu(dd->work[0].cpu_binding);
+               drop_cpu(dd->work[1].cpu_binding);
+               drop_cpu(dd->work[2].cpu_binding);
+       }
+
        pci_disable_msi(pdev);
 
        kfree(dd);
index b174264..d782b1a 100644
@@ -164,6 +164,20 @@ struct smart_attr {
        u8 res[3];
 } __packed;
 
+struct mtip_work {
+       struct work_struct work;
+       void *port;
+       int cpu_binding;
+       u32 completed;
+} ____cacheline_aligned_in_smp;
+
+#define DEFINE_HANDLER(group)                                          \
+       void mtip_workq_sdbf##group(struct work_struct *work)           \
+       {                                                               \
+               struct mtip_work *w = (struct mtip_work *) work;        \
+               mtip_workq_sdbfx(w->port, group, w->completed);         \
+       }
+
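
The cast in the macro is valid only because work is the first member of
struct mtip_work; container_of() would be the position-independent spelling.
For reference, DEFINE_HANDLER(0) expands to:

    void mtip_workq_sdbf0(struct work_struct *work)
    {
            /* same as container_of(work, struct mtip_work, work) */
            struct mtip_work *w = (struct mtip_work *) work;

            mtip_workq_sdbfx(w->port, 0, w->completed);
    }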
 /* Register Frame Information Structure (FIS), host to device. */
 struct host_to_dev_fis {
        /*
@@ -424,7 +438,7 @@ struct mtip_port {
         */
        struct semaphore cmd_slot;
        /* Spinlock for working around command-issue bug. */
-       spinlock_t cmd_issue_lock;
+       spinlock_t cmd_issue_lock[MTIP_MAX_SLOT_GROUPS];
 };
 
 /*
@@ -447,9 +461,6 @@ struct driver_data {
 
        struct mtip_port *port; /* Pointer to the port data structure. */
 
-       /* Tasklet used to process the bottom half of the ISR. */
-       struct tasklet_struct tasklet;
-
        unsigned product_type; /* magic value declaring the product type */
 
        unsigned slot_groups; /* number of slot groups the product supports */
@@ -461,6 +472,18 @@ struct driver_data {
        struct task_struct *mtip_svc_handler; /* task_struct of svc thd */
 
        struct dentry *dfs_node;
+
+       int numa_node; /* NUMA support */
+
+       char workq_name[32];
+
+       struct workqueue_struct *isr_workq;
+
+       struct mtip_work work[MTIP_MAX_SLOT_GROUPS];
+
+       atomic_t irq_workers_active;
+
+       int isr_binding;
 };
 
 #endif