nvme-tcp: fix io_work priority inversion
author	Keith Busch <kbusch@kernel.org>	Thu, 9 Sep 2021 15:54:52 +0000 (08:54 -0700)
committer	Dom Cobley <popcornmix@gmail.com>	Thu, 14 Oct 2021 11:32:55 +0000 (12:32 +0100)
commit 70f437fb4395ad4d1d16fab9a1ad9fbc9fc0579b upstream.

Dispatching requests inline with the .queue_rq() call may block while
holding the send_mutex. If the tcp io_work also happens to schedule, it
may see the req_list is non-empty, leaving "pending" true and remaining
in TASK_RUNNING. Since io_work is of higher scheduling priority, the
.queue_rq task may not get a chance to run, blocking forward progress
and leading to io timeouts.
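
To make the inversion concrete, the pre-patch send/recv loop looked roughly
like this (paraphrased from the driver around this version; not an exact copy
of the file):

static void nvme_tcp_io_work(struct work_struct *w)
{
	struct nvme_tcp_queue *queue =
		container_of(w, struct nvme_tcp_queue, io_work);
	unsigned long deadline = jiffies + msecs_to_jiffies(1);

	do {
		bool pending = false;
		int result;

		if (mutex_trylock(&queue->send_mutex)) {
			result = nvme_tcp_try_send(queue);
			mutex_unlock(&queue->send_mutex);
			if (result > 0)
				pending = true;
			else if (unlikely(result < 0))
				break;
		} else
			/*
			 * .queue_rq() owns send_mutex and is blocked in the
			 * socket send path; req_list stays non-empty, so
			 * "pending" is forced true on every pass.
			 */
			pending = !llist_empty(&queue->req_list);

		result = nvme_tcp_try_recv(queue);
		if (result > 0)
			pending = true;
		else if (unlikely(result < 0))
			return;

		if (!pending)
			return;

	} while (!time_after(jiffies, deadline)); /* 1ms quota exhausted */

	/*
	 * io_work immediately re-queues itself on the same CPU; on the
	 * high-priority nvme_tcp_wq this can starve the .queue_rq task
	 * that still holds send_mutex.
	 */
	queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}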

Instead of checking for pending requests within io_work, let the queueing
restart io_work outside the send_mutex lock if there is more work to be
done.
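
For reference, the resulting queueing path, assembled from the hunks below
(context lines outside the hunks are reconstructed from the surrounding driver
code and may differ slightly in this tree):

static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
{
	return !list_empty(&queue->send_list) ||
		!llist_empty(&queue->req_list) || queue->more_requests;
}

static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
		bool sync, bool last)
{
	struct nvme_tcp_queue *queue = req->queue;
	bool empty;

	empty = llist_add(&req->lentry, &queue->req_list) &&
		list_empty(&queue->send_list) && !queue->request;

	/*
	 * Try the inline send only when this CPU owns the queue and the
	 * trylock succeeds; otherwise fall through.
	 */
	if (queue->io_cpu == raw_smp_processor_id() &&
	    sync && empty && mutex_trylock(&queue->send_mutex)) {
		queue->more_requests = !last;
		nvme_tcp_send_all(queue);
		queue->more_requests = false;
		mutex_unlock(&queue->send_mutex);
	}

	/*
	 * Kick io_work only after send_mutex has been released and only if
	 * there is still work queued; io_work no longer has to poll
	 * req_list while another task holds the lock.
	 */
	if (last && nvme_tcp_queue_more(queue))
		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}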

Fixes: a0fdd1418007f ("nvme-tcp: rerun io_work if req_list is not empty")
Reported-by: Samuel Jones <sjones@kalrayinc.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
drivers/nvme/host/tcp.c

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index c9a9259..a6b3b07 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -273,6 +273,12 @@ static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue)
        } while (ret > 0);
 }
 
+static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
+{
+       return !list_empty(&queue->send_list) ||
+               !llist_empty(&queue->req_list) || queue->more_requests;
+}
+
 static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
                bool sync, bool last)
 {
@@ -293,9 +299,10 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
                nvme_tcp_send_all(queue);
                queue->more_requests = false;
                mutex_unlock(&queue->send_mutex);
-       } else if (last) {
-               queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
        }
+
+       if (last && nvme_tcp_queue_more(queue))
+               queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
 }
 
 static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue)
@@ -890,12 +897,6 @@ done:
        read_unlock_bh(&sk->sk_callback_lock);
 }
 
-static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
-{
-       return !list_empty(&queue->send_list) ||
-               !llist_empty(&queue->req_list) || queue->more_requests;
-}
-
 static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
 {
        queue->request = NULL;
@@ -1132,8 +1133,7 @@ static void nvme_tcp_io_work(struct work_struct *w)
                                pending = true;
                        else if (unlikely(result < 0))
                                break;
-               } else
-                       pending = !llist_empty(&queue->req_list);
+               }
 
                result = nvme_tcp_try_recv(queue);
                if (result > 0)
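
With the else branch gone, io_work only marks "pending" for progress it made
itself, and the loop exit is unchanged (sketch; lines not shown in the hunk
are paraphrased):

		if (!pending)
			return;

	} while (!time_after(jiffies, deadline)); /* 1ms quota exhausted */

	/*
	 * Requeue only when this invocation actually made progress; requests
	 * queued while another task held send_mutex are now rescheduled by
	 * nvme_tcp_queue_request() after that task drops the mutex.
	 */
	queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);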