RDMA/rxe: Replace mr by rkey in responder resources
author: Bob Pearson <rpearsonhpe@gmail.com>
Fri, 4 Mar 2022 00:07:58 +0000 (18:07 -0600)
committer: Jason Gunthorpe <jgg@nvidia.com>
Tue, 15 Mar 2022 23:49:56 +0000 (20:49 -0300)
Currently rxe saves a copy of MR in responder resources for RDMA reads.
Since the responder resources are never freed, just overwritten when more
are needed, this MR may not have its reference dropped until the QP is
destroyed. This patch uses the rkey instead of the MR and on subsequent
packets of a multipacket read reply message it looks up the MR from the
rkey for each packet. This makes it possible for a user to deregister an
MR or unbind a MW on the fly and get correct behaviour.

Link: https://lore.kernel.org/r/20220304000808.225811-3-rpearsonhpe@gmail.com
Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
drivers/infiniband/sw/rxe/rxe_qp.c
drivers/infiniband/sw/rxe/rxe_resp.c
drivers/infiniband/sw/rxe/rxe_verbs.h

index 5f270cb..26d461a 100644 (file)
@@ -135,12 +135,8 @@ static void free_rd_atomic_resources(struct rxe_qp *qp)
 
 void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res)
 {
-       if (res->type == RXE_ATOMIC_MASK) {
+       if (res->type == RXE_ATOMIC_MASK)
                kfree_skb(res->atomic.skb);
-       } else if (res->type == RXE_READ_MASK) {
-               if (res->read.mr)
-                       rxe_drop_ref(res->read.mr);
-       }
        res->type = 0;
 }
 
@@ -825,10 +821,8 @@ static void rxe_qp_do_cleanup(struct work_struct *work)
        if (qp->pd)
                rxe_drop_ref(qp->pd);
 
-       if (qp->resp.mr) {
+       if (qp->resp.mr)
                rxe_drop_ref(qp->resp.mr);
-               qp->resp.mr = NULL;
-       }
 
        if (qp_type(qp) == IB_QPT_RC)
                sk_dst_reset(qp->sk->sk);
index b5ebe85..b1ec003 100644 (file)
@@ -642,6 +642,78 @@ static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp,
        return skb;
 }
 
+static struct resp_res *rxe_prepare_read_res(struct rxe_qp *qp,
+                                       struct rxe_pkt_info *pkt)
+{
+       struct resp_res *res;
+       u32 pkts;
+
+       res = &qp->resp.resources[qp->resp.res_head];
+       rxe_advance_resp_resource(qp);
+       free_rd_atomic_resource(qp, res);
+
+       res->type = RXE_READ_MASK;
+       res->replay = 0;
+       res->read.va = qp->resp.va + qp->resp.offset;
+       res->read.va_org = qp->resp.va + qp->resp.offset;
+       res->read.resid = qp->resp.resid;
+       res->read.length = qp->resp.resid;
+       res->read.rkey = qp->resp.rkey;
+
+       pkts = max_t(u32, (reth_len(pkt) + qp->mtu - 1)/qp->mtu, 1);
+       res->first_psn = pkt->psn;
+       res->cur_psn = pkt->psn;
+       res->last_psn = (pkt->psn + pkts - 1) & BTH_PSN_MASK;
+
+       res->state = rdatm_res_state_new;
+
+       return res;
+}
+
+/**
+ * rxe_recheck_mr - revalidate MR from rkey and get a reference
+ * @qp: the qp
+ * @rkey: the rkey
+ *
+ * This allows the MR to be invalidated or deregistered, or the
+ * MW (if one was used) to be invalidated or deallocated, on the
+ * fly. It is assumed that the access permissions, if originally
+ * good, remain OK and that the mappings are unchanged.
+ *
+ * Return: mr on success else NULL
+ */
+static struct rxe_mr *rxe_recheck_mr(struct rxe_qp *qp, u32 rkey)
+{
+       struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+       struct rxe_mr *mr;
+       struct rxe_mw *mw;
+
+       if (rkey_is_mw(rkey)) {
+               mw = rxe_pool_get_index(&rxe->mw_pool, rkey >> 8);
+               if (!mw || mw->rkey != rkey)
+                       return NULL;
+
+               if (mw->state != RXE_MW_STATE_VALID) {
+                       rxe_drop_ref(mw);
+                       return NULL;
+               }
+
+               mr = mw->mr;
+               rxe_drop_ref(mw);
+       } else {
+               mr = rxe_pool_get_index(&rxe->mr_pool, rkey >> 8);
+               if (!mr || mr->rkey != rkey)
+                       return NULL;
+       }
+
+       if (mr->state != RXE_MR_STATE_VALID) {
+               rxe_drop_ref(mr);
+               return NULL;
+       }
+
+       return mr;
+}
+
 /* RDMA read response. If res is not NULL, then we have a current RDMA request
  * being processed or replayed.
  */
@@ -656,53 +728,26 @@ static enum resp_states read_reply(struct rxe_qp *qp,
        int opcode;
        int err;
        struct resp_res *res = qp->resp.res;
+       struct rxe_mr *mr;
 
        if (!res) {
-               /* This is the first time we process that request. Get a
-                * resource
-                */
-               res = &qp->resp.resources[qp->resp.res_head];
-
-               free_rd_atomic_resource(qp, res);
-               rxe_advance_resp_resource(qp);
-
-               res->type               = RXE_READ_MASK;
-               res->replay             = 0;
-
-               res->read.va            = qp->resp.va +
-                                         qp->resp.offset;
-               res->read.va_org        = qp->resp.va +
-                                         qp->resp.offset;
-
-               res->first_psn          = req_pkt->psn;
-
-               if (reth_len(req_pkt)) {
-                       res->last_psn   = (req_pkt->psn +
-                                          (reth_len(req_pkt) + mtu - 1) /
-                                          mtu - 1) & BTH_PSN_MASK;
-               } else {
-                       res->last_psn   = res->first_psn;
-               }
-               res->cur_psn            = req_pkt->psn;
-
-               res->read.resid         = qp->resp.resid;
-               res->read.length        = qp->resp.resid;
-               res->read.rkey          = qp->resp.rkey;
-
-               /* note res inherits the reference to mr from qp */
-               res->read.mr            = qp->resp.mr;
-               qp->resp.mr             = NULL;
-
-               qp->resp.res            = res;
-               res->state              = rdatm_res_state_new;
+               res = rxe_prepare_read_res(qp, req_pkt);
+               qp->resp.res = res;
        }
 
        if (res->state == rdatm_res_state_new) {
+               mr = qp->resp.mr;
+               qp->resp.mr = NULL;
+
                if (res->read.resid <= mtu)
                        opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY;
                else
                        opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST;
        } else {
+               mr = rxe_recheck_mr(qp, res->read.rkey);
+               if (!mr)
+                       return RESPST_ERR_RKEY_VIOLATION;
+
                if (res->read.resid > mtu)
                        opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE;
                else
@@ -718,10 +763,12 @@ static enum resp_states read_reply(struct rxe_qp *qp,
        if (!skb)
                return RESPST_ERR_RNR;
 
-       err = rxe_mr_copy(res->read.mr, res->read.va, payload_addr(&ack_pkt),
+       err = rxe_mr_copy(mr, res->read.va, payload_addr(&ack_pkt),
                          payload, RXE_FROM_MR_OBJ);
        if (err)
                pr_err("Failed copying memory\n");
+       if (mr)
+               rxe_drop_ref(mr);
 
        if (bth_pad(&ack_pkt)) {
                u8 *pad = payload_addr(&ack_pkt) + payload;
index 6b15251..e7eff1c 100644 (file)
@@ -157,7 +157,6 @@ struct resp_res {
                        struct sk_buff  *skb;
                } atomic;
                struct {
-                       struct rxe_mr   *mr;
                        u64             va_org;
                        u32             rkey;
                        u32             length;