svcrdma: Revert "svcrdma: Reduce Receive doorbell rate"
authorChuck Lever <chuck.lever@oracle.com>
Thu, 11 Mar 2021 18:25:01 +0000 (13:25 -0500)
committerChuck Lever <chuck.lever@oracle.com>
Thu, 11 Mar 2021 20:26:07 +0000 (15:26 -0500)
I tested commit 43042b90cae1 ("svcrdma: Reduce Receive doorbell
rate") with mlx4 (IB) and software iWARP and didn't find any
issues. However, I recently got my hardware iWARP setup back on
line (FastLinQ) and it's crashing hard on this commit (confirmed
via bisect).

The failure mode is complex.
 - After a connection is established, the first Receive completes
   normally.
 - But the second and third Receives have garbage in their Receive
   buffers. The server responds with ERR_VERS as a result.
 - When the client tears down the connection to retry, a couple
   of posted Receives flush twice, and that corrupts the recv_ctxt
   free list.
 - __svc_rdma_free then faults or loops infinitely while destroying
   the xprt's recv_ctxts.

Since 43042b90cae1 ("svcrdma: Reduce Receive doorbell rate") does
not fix a bug but is a scalability enhancement, it's safe and
appropriate to revert it while working on a replacement.

Fixes: 43042b90cae1 ("svcrdma: Reduce Receive doorbell rate")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
include/linux/sunrpc/svc_rdma.h
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c

index 7c693b3..1e76ed6 100644 (file)
@@ -104,7 +104,6 @@ struct svcxprt_rdma {
 
        wait_queue_head_t    sc_send_wait;      /* SQ exhaustion waitlist */
        unsigned long        sc_flags;
-       u32                  sc_pending_recvs;
        struct list_head     sc_read_complete_q;
        struct work_struct   sc_work;
 
index 6d28f23..7d34290 100644 (file)
@@ -266,46 +266,33 @@ void svc_rdma_release_rqst(struct svc_rqst *rqstp)
                svc_rdma_recv_ctxt_put(rdma, ctxt);
 }
 
-static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma,
-                                  unsigned int wanted, bool temp)
+static int __svc_rdma_post_recv(struct svcxprt_rdma *rdma,
+                               struct svc_rdma_recv_ctxt *ctxt)
 {
-       const struct ib_recv_wr *bad_wr = NULL;
-       struct svc_rdma_recv_ctxt *ctxt;
-       struct ib_recv_wr *recv_chain;
        int ret;
 
-       recv_chain = NULL;
-       while (wanted--) {
-               ctxt = svc_rdma_recv_ctxt_get(rdma);
-               if (!ctxt)
-                       break;
-
-               trace_svcrdma_post_recv(ctxt);
-               ctxt->rc_temp = temp;
-               ctxt->rc_recv_wr.next = recv_chain;
-               recv_chain = &ctxt->rc_recv_wr;
-               rdma->sc_pending_recvs++;
-       }
-       if (!recv_chain)
-               return false;
-
-       ret = ib_post_recv(rdma->sc_qp, recv_chain, &bad_wr);
+       trace_svcrdma_post_recv(ctxt);
+       ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, NULL);
        if (ret)
                goto err_post;
-       return true;
+       return 0;
 
 err_post:
-       while (bad_wr) {
-               ctxt = container_of(bad_wr, struct svc_rdma_recv_ctxt,
-                                   rc_recv_wr);
-               bad_wr = bad_wr->next;
-               svc_rdma_recv_ctxt_put(rdma, ctxt);
-       }
-
        trace_svcrdma_rq_post_err(rdma, ret);
-       /* Since we're destroying the xprt, no need to reset
-        * sc_pending_recvs. */
-       return false;
+       svc_rdma_recv_ctxt_put(rdma, ctxt);
+       return ret;
+}
+
+static int svc_rdma_post_recv(struct svcxprt_rdma *rdma)
+{
+       struct svc_rdma_recv_ctxt *ctxt;
+
+       if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags))
+               return 0;
+       ctxt = svc_rdma_recv_ctxt_get(rdma);
+       if (!ctxt)
+               return -ENOMEM;
+       return __svc_rdma_post_recv(rdma, ctxt);
 }
 
 /**
@@ -316,7 +303,20 @@ err_post:
  */
 bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma)
 {
-       return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests, true);
+       struct svc_rdma_recv_ctxt *ctxt;
+       unsigned int i;
+       int ret;
+
+       for (i = 0; i < rdma->sc_max_requests; i++) {
+               ctxt = svc_rdma_recv_ctxt_get(rdma);
+               if (!ctxt)
+                       return false;
+               ctxt->rc_temp = true;
+               ret = __svc_rdma_post_recv(rdma, ctxt);
+               if (ret)
+                       return false;
+       }
+       return true;
 }
 
 /**
@@ -324,6 +324,8 @@ bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma)
  * @cq: Completion Queue context
  * @wc: Work Completion object
  *
+ * NB: The svc_xprt/svcxprt_rdma is pinned whenever it's possible that
+ * the Receive completion handler could be running.
  */
 static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 {
@@ -331,8 +333,6 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
        struct ib_cqe *cqe = wc->wr_cqe;
        struct svc_rdma_recv_ctxt *ctxt;
 
-       rdma->sc_pending_recvs--;
-
        /* WARNING: Only wc->wr_cqe and wc->status are reliable */
        ctxt = container_of(cqe, struct svc_rdma_recv_ctxt, rc_cqe);
 
@@ -340,6 +340,9 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
        if (wc->status != IB_WC_SUCCESS)
                goto flushed;
 
+       if (svc_rdma_post_recv(rdma))
+               goto post_err;
+
        /* All wc fields are now known to be valid */
        ctxt->rc_byte_len = wc->byte_len;
 
@@ -350,18 +353,11 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
        spin_unlock(&rdma->sc_rq_dto_lock);
        if (!test_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags))
                svc_xprt_enqueue(&rdma->sc_xprt);
-
-       if (!test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags) &&
-           rdma->sc_pending_recvs < rdma->sc_max_requests)
-               if (!svc_rdma_refresh_recvs(rdma, RPCRDMA_MAX_RECV_BATCH,
-                                           false))
-                       goto post_err;
-
        return;
 
 flushed:
-       svc_rdma_recv_ctxt_put(rdma, ctxt);
 post_err:
+       svc_rdma_recv_ctxt_put(rdma, ctxt);
        set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
        svc_xprt_enqueue(&rdma->sc_xprt);
 }