module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");
+static int queue_count_set(const char *val, const struct kernel_param *kp);
+static const struct kernel_param_ops queue_count_ops = {
+ .set = queue_count_set,
+ .get = param_get_int,
+};
+
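+/*
+ * Optional split of the I/O queues into separate write and read sets.
+ * For example (hypothetical value), booting with "nvme.write_queues=4"
+ * dedicates four queues to writes; the default of 0 keeps a single
+ * shared queue set.
+ */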
+static int write_queues;
+module_param_cb(write_queues, &queue_count_ops, &write_queues, 0644);
+MODULE_PARM_DESC(write_queues,
+ "Number of queues to use for writes. If not set, reads and writes "
+ "will share a queue set.");
+
struct nvme_dev;
struct nvme_queue;
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
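+/*
+ * Queue types: used to index dev->io_queues[] and the tag set queue
+ * maps, one entry per type.
+ */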
+enum {
+ NVMEQ_TYPE_READ,
+ NVMEQ_TYPE_WRITE,
+ NVMEQ_TYPE_NR,
+};
+
/*
* Represents an NVM Express device. Each nvme_dev is a PCI function.
*/
struct dma_pool *prp_small_pool;
unsigned online_queues;
unsigned max_qid;
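+ /* I/O queue count for each NVMEQ_TYPE_* map */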
+ unsigned io_queues[NVMEQ_TYPE_NR];
unsigned int num_vecs;
int q_depth;
u32 db_stride;
return param_set_int(val, kp);
}
+static int queue_count_set(const char *val, const struct kernel_param *kp)
+{
+ int n, ret;
+
+ ret = kstrtoint(val, 10, &n);
+ if (ret)
+ return ret;
+ if (n > num_possible_cpus())
+ n = num_possible_cpus();
+
+ return param_set_int(val, kp);
+}
+
static inline unsigned int sq_idx(unsigned int qid, u32 stride)
{
return qid * 2 * stride;
BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
}
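+/*
+ * Upper bounds used for allocations: one read queue per possible CPU
+ * plus any dedicated write queues, and one admin queue on top. For
+ * example (hypothetical numbers), 8 possible CPUs with write_queues=2
+ * give max_io_queues() == 10 and max_queue_count() == 11.
+ */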
+static unsigned int max_io_queues(void)
+{
+ return num_possible_cpus() + write_queues;
+}
+
+static unsigned int max_queue_count(void)
+{
+ /* IO queues + admin queue */
+ return 1 + max_io_queues();
+}
+
static inline unsigned int nvme_dbbuf_size(u32 stride)
{
- return ((num_possible_cpus() + 1) * 8 * stride);
+ return (max_queue_count() * 8 * stride);
}
static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
return 0;
}
+static int queue_irq_offset(struct nvme_dev *dev)
+{
+ /* if we have more than 1 vec, admin queue offsets us by 1 */
+ if (dev->num_vecs > 1)
+ return 1;
+
+ return 0;
+}
+
static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
{
struct nvme_dev *dev = set->driver_data;
+ int i, qoff, offset;
+
+ offset = queue_irq_offset(dev);
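+ /*
+ * Hand each queue map a contiguous range of hardware queues and IRQ
+ * vectors. An empty write map means reads and writes share one set,
+ * so fall back to the read map's size and offsets.
+ */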
+ for (i = 0, qoff = 0; i < set->nr_maps; i++) {
+ struct blk_mq_queue_map *map = &set->map[i];
+
+ map->nr_queues = dev->io_queues[i];
+ if (!map->nr_queues) {
+ BUG_ON(i == NVMEQ_TYPE_READ);
- return blk_mq_pci_map_queues(&set->map[0], to_pci_dev(dev->dev),
- dev->num_vecs > 1 ? 1 /* admin queue */ : 0);
+ /* shared set, reuse read set parameters */
+ map->nr_queues = dev->io_queues[NVMEQ_TYPE_READ];
+ qoff = 0;
+ offset = queue_irq_offset(dev);
+ }
+
+ map->queue_offset = qoff;
+ blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
+ qoff += map->nr_queues;
+ offset += map->nr_queues;
+ }
+
+ return 0;
}
/**
return ret;
}
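+/*
+ * Pick the queue type for a request: reads use the read set, everything
+ * else (writes, flushes, discards, ...) uses the write set.
+ */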
+static int nvme_rq_flags_to_type(struct request_queue *q, unsigned int flags)
+{
+ if ((flags & REQ_OP_MASK) == REQ_OP_READ)
+ return NVMEQ_TYPE_READ;
+
+ return NVMEQ_TYPE_WRITE;
+}
+
static void nvme_pci_complete_rq(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
};
static const struct blk_mq_ops nvme_mq_ops = {
- .queue_rq = nvme_queue_rq,
- .complete = nvme_pci_complete_rq,
- .init_hctx = nvme_init_hctx,
- .init_request = nvme_init_request,
- .map_queues = nvme_pci_map_queues,
- .timeout = nvme_timeout,
- .poll = nvme_poll,
+ .queue_rq = nvme_queue_rq,
+ .rq_flags_to_type = nvme_rq_flags_to_type,
+ .complete = nvme_pci_complete_rq,
+ .init_hctx = nvme_init_hctx,
+ .init_request = nvme_init_request,
+ .map_queues = nvme_pci_map_queues,
+ .timeout = nvme_timeout,
+ .poll = nvme_poll,
};
static void nvme_dev_remove_admin(struct nvme_dev *dev)
return ret;
}
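+/*
+ * Split nr_io_queues between the read and write sets. For example
+ * (hypothetical numbers), nr_io_queues=8 with write_queues=3 yields 5
+ * read queues and 3 write queues, while write_queues=0 leaves all 8 in
+ * the shared read set.
+ */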
+static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int nr_io_queues)
+{
+ unsigned int this_w_queues = write_queues;
+
+ /*
+ * Setup read/write queue split
+ */
+ if (nr_io_queues == 1) {
+ dev->io_queues[NVMEQ_TYPE_READ] = 1;
+ dev->io_queues[NVMEQ_TYPE_WRITE] = 0;
+ return;
+ }
+
+ /*
+ * If 'write_queues' is set, ensure it leaves room for at least
+ * one read queue
+ */
+ if (this_w_queues >= nr_io_queues)
+ this_w_queues = nr_io_queues - 1;
+
+ /*
+ * If 'write_queues' is set to zero, reads and writes will share
+ * a queue set.
+ */
+ if (!this_w_queues) {
+ dev->io_queues[NVMEQ_TYPE_WRITE] = 0;
+ dev->io_queues[NVMEQ_TYPE_READ] = nr_io_queues;
+ } else {
+ dev->io_queues[NVMEQ_TYPE_WRITE] = this_w_queues;
+ dev->io_queues[NVMEQ_TYPE_READ] = nr_io_queues - this_w_queues;
+ }
+}
+
+static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues)
+{
+ struct pci_dev *pdev = to_pci_dev(dev->dev);
+ int irq_sets[2];
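+ /*
+ * One vector is reserved up front for the admin queue and excluded
+ * from affinity spreading; the two sets carry the read/write vector
+ * split.
+ */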
+ struct irq_affinity affd = {
+ .pre_vectors = 1,
+ .nr_sets = ARRAY_SIZE(irq_sets),
+ .sets = irq_sets,
+ };
+ int result;
+
+ /*
+ * For irq sets, we have to ask for minvec == maxvec. This passes
+ * any reduction back to us, so we can adjust our queue counts and
+ * IRQ vector needs.
+ */
+ do {
+ nvme_calc_io_queues(dev, nr_io_queues);
+ irq_sets[0] = dev->io_queues[NVMEQ_TYPE_READ];
+ irq_sets[1] = dev->io_queues[NVMEQ_TYPE_WRITE];
+ if (!irq_sets[1])
+ affd.nr_sets = 1;
+
+ /*
+ * Need IRQs for read+write queues, and one for the admin queue
+ */
+ nr_io_queues = irq_sets[0] + irq_sets[1] + 1;
+
+ result = pci_alloc_irq_vectors_affinity(pdev, nr_io_queues,
+ nr_io_queues,
+ PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
+
+ /*
+ * Need to reduce our vec counts
+ */
+ if (result == -ENOSPC) {
+ nr_io_queues--;
+ if (!nr_io_queues)
+ return result;
+ continue;
+ } else if (result <= 0)
+ return -EIO;
+ break;
+ } while (1);
+
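+ /* on success, result is the total vector count, admin vector included */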
+ return result;
+}
+
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
struct nvme_queue *adminq = &dev->queues[0];
int result, nr_io_queues;
unsigned long size;
- struct irq_affinity affd = {
- .pre_vectors = 1
- };
-
- nr_io_queues = num_possible_cpus();
+ nr_io_queues = max_io_queues();
result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
if (result < 0)
return result;
* setting up the full range we need.
*/
pci_free_irq_vectors(pdev);
- result = pci_alloc_irq_vectors_affinity(pdev, 1, nr_io_queues + 1,
- PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
+
+ result = nvme_setup_irqs(dev, nr_io_queues);
if (result <= 0)
return -EIO;
+
dev->num_vecs = result;
dev->max_qid = max(result - 1, 1);
+ dev_info(dev->ctrl.device, "%d/%d read/write queues\n",
+ dev->io_queues[NVMEQ_TYPE_READ],
+ dev->io_queues[NVMEQ_TYPE_WRITE]);
+
/*
* Should investigate if there's a performance win from allocating
* more queues than interrupt vectors; it might allow the submission
if (!dev->ctrl.tagset) {
dev->tagset.ops = &nvme_mq_ops;
dev->tagset.nr_hw_queues = dev->online_queues - 1;
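+ /* one blk-mq queue map per NVMEQ_TYPE_* */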
+ dev->tagset.nr_maps = NVMEQ_TYPE_NR;
dev->tagset.timeout = NVME_IO_TIMEOUT;
dev->tagset.numa_node = dev_to_node(dev->dev);
dev->tagset.queue_depth =
if (!dev)
return -ENOMEM;
- dev->queues = kcalloc_node(num_possible_cpus() + 1,
- sizeof(struct nvme_queue), GFP_KERNEL, node);
+ dev->queues = kcalloc_node(max_queue_count(), sizeof(struct nvme_queue),
+ GFP_KERNEL, node);
if (!dev->queues)
goto free;