nvme-tcp: optimize queue io_cpu assignment for multiple queue maps
author     Sagi Grimberg <sagi@grimberg.me>
           Tue, 25 Feb 2020 23:53:09 +0000 (15:53 -0800)
committer  Keith Busch <kbusch@kernel.org>
           Wed, 25 Mar 2020 19:48:06 +0000 (04:48 +0900)
Currently, queue io_cpu assignment is done sequentially for default,
read and poll queues based on queue id. This causes a misalignment between
the context of the CPU initiating I/O and the I/O worker thread processing
queued requests or completions.

Change the queue io_cpu assignment to take the queue map offsets into
account, so that io_cpu assignment starts at zero for each queue map.
This essentially aligns the read/poll queues to start over the same CPU
range as the default queues.
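
As an illustration only (the queue counts and CPU count below are
hypothetical and not part of this patch), a minimal userspace sketch of
the resulting per-map mapping:

/* Illustrative userspace sketch, not kernel code: assumes 4 default,
 * 4 read and 2 poll queues and 8 "online" CPUs, all hypothetical. */
#include <stdio.h>

#define NR_DEFAULT 4
#define NR_READ    4
#define NR_POLL    2
#define NR_CPUS    8   /* stand-in for num_online_cpus() */

/* Rebase qid against its queue map's offset, then wrap over the CPUs,
 * mirroring the intent of nvme_tcp_set_queue_io_cpu(). */
static int io_cpu_for_qid(int qid)
{
	int n;

	if (qid == 0)                           /* admin queue */
		n = 0;
	else if (qid <= NR_DEFAULT)             /* default map */
		n = qid - 1;
	else if (qid <= NR_DEFAULT + NR_READ)   /* read map */
		n = qid - NR_DEFAULT - 1;
	else                                    /* poll map */
		n = qid - NR_DEFAULT - NR_READ - 1;

	return n % NR_CPUS;
}

int main(void)
{
	for (int qid = 0; qid <= NR_DEFAULT + NR_READ + NR_POLL; qid++)
		printf("qid %2d -> io_cpu %d\n", qid, io_cpu_for_qid(qid));
	return 0;
}

With these hypothetical numbers, read queues 5-8 land on CPUs 0-3, the
same range as default queues 1-4, whereas the old sequential scheme
would have placed them on CPUs 4-7.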

Testing performed by Mark with:
- ram device (nvmet)
- single CPU core (pinned)
- 100% 4k reads
- engine io_uring (not using sq_thread option)
- hipri flag set

Micro-benchmark results show a net gain of:
- an 18%-29% increase in IOPS
- a 16%-22% reduction in average latency
- a 7%-23% reduction in 99.99% latency

Baseline:
=========
QDepth/Batch | IOPS [k] | Avg. Lat [us] | 99.99% Lat [us]
---------------------------------------------------------
1/1          |     32.4 |         30.11 |           50.94
32/8         |      179 |        168.20 |             371

CPU alignment:
==============
QDepth/Batch | IOPS [k] | Avg. Lat [us] | 99.99% Lat [us]
---------------------------------------------------------
1/1          |     38.5 |         25.18 |           39.16
32/8         |      231 |        130.75 |             343

Reported-by: Mark Wunderlich <mark.wunderlich@intel.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Keith Busch <kbusch@kernel.org>
drivers/nvme/host/tcp.c

index e384239..11a7c26 100644
@@ -1258,13 +1258,67 @@ free_icreq:
        return ret;
 }
 
+static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue)
+{
+       return nvme_tcp_queue_id(queue) == 0;
+}
+
+static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue)
+{
+       struct nvme_tcp_ctrl *ctrl = queue->ctrl;
+       int qid = nvme_tcp_queue_id(queue);
+
+       return !nvme_tcp_admin_queue(queue) &&
+               qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT];
+}
+
+static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue)
+{
+       struct nvme_tcp_ctrl *ctrl = queue->ctrl;
+       int qid = nvme_tcp_queue_id(queue);
+
+       return !nvme_tcp_admin_queue(queue) &&
+               !nvme_tcp_default_queue(queue) &&
+               qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
+                         ctrl->io_queues[HCTX_TYPE_READ];
+}
+
+static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
+{
+       struct nvme_tcp_ctrl *ctrl = queue->ctrl;
+       int qid = nvme_tcp_queue_id(queue);
+
+       return !nvme_tcp_admin_queue(queue) &&
+               !nvme_tcp_default_queue(queue) &&
+               !nvme_tcp_read_queue(queue) &&
+               qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
+                         ctrl->io_queues[HCTX_TYPE_READ] +
+                         ctrl->io_queues[HCTX_TYPE_POLL];
+}
+
+static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
+{
+       struct nvme_tcp_ctrl *ctrl = queue->ctrl;
+       int qid = nvme_tcp_queue_id(queue);
+       int n = 0;
+
+       if (nvme_tcp_default_queue(queue))
+               n = qid - 1;
+       else if (nvme_tcp_read_queue(queue))
+               n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
+       else if (nvme_tcp_poll_queue(queue))
+               n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
+                               ctrl->io_queues[HCTX_TYPE_READ] - 1;
+       queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
+}
+
 static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
                int qid, size_t queue_size)
 {
        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
        struct nvme_tcp_queue *queue = &ctrl->queues[qid];
        struct linger sol = { .l_onoff = 1, .l_linger = 0 };
-       int ret, opt, rcv_pdu_size, n;
+       int ret, opt, rcv_pdu_size;
 
        queue->ctrl = ctrl;
        INIT_LIST_HEAD(&queue->send_list);
@@ -1343,11 +1397,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
        }
 
        queue->sock->sk->sk_allocation = GFP_ATOMIC;
-       if (!qid)
-               n = 0;
-       else
-               n = (qid - 1) % num_online_cpus();
-       queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
+       nvme_tcp_set_queue_io_cpu(queue);
        queue->request = NULL;
        queue->data_remaining = 0;
        queue->ddgst_remaining = 0;