net/rds: Wait for the FRMR_IS_FREE (or FRMR_IS_STALE) transition after posting IB_WR_...
authorGerd Rausch <gerd.rausch@oracle.com>
Tue, 16 Jul 2019 22:29:02 +0000 (15:29 -0700)
committerDavid S. Miller <davem@davemloft.net>
Wed, 17 Jul 2019 19:06:52 +0000 (12:06 -0700)
In order to:
1) avoid a silly bouncing between "clean_list" and "drop_list"
   triggered by function "rds_ib_reg_frmr" as it releases frmr
   regions whose state is not "FRMR_IS_FREE" right away.

2) prevent an invalid access error caused by a race between a pending
   "IB_WR_LOCAL_INV" operation and a teardown ("dma_unmap_sg", "put_page")
   plus de-registration ("ib_dereg_mr") of the corresponding
   memory region.

Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
net/rds/ib_frmr.c
net/rds/ib_mr.h

index 6038138..708c553 100644 (file)
@@ -76,6 +76,7 @@ static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev,
 
        frmr->fr_state = FRMR_IS_FREE;
        init_waitqueue_head(&frmr->fr_inv_done);
+       init_waitqueue_head(&frmr->fr_reg_done);
        return ibmr;
 
 out_no_cigar:
@@ -124,6 +125,7 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
         */
        ib_update_fast_reg_key(frmr->mr, ibmr->remap_count++);
        frmr->fr_state = FRMR_IS_INUSE;
+       frmr->fr_reg = true;
 
        memset(&reg_wr, 0, sizeof(reg_wr));
        reg_wr.wr.wr_id = (unsigned long)(void *)ibmr;
@@ -144,7 +146,17 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
                if (printk_ratelimit())
                        pr_warn("RDS/IB: %s returned error(%d)\n",
                                __func__, ret);
+               goto out;
        }
+
+       /* Wait for the registration to complete in order to prevent an invalid
+        * access error resulting from a race between the memory region already
+        * being accessed while registration is still pending.
+        */
+       wait_event(frmr->fr_reg_done, !frmr->fr_reg);
+
+out:
+
        return ret;
 }
 
@@ -262,6 +274,19 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
                pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret);
                goto out;
        }
+
+       /* Wait for the FRMR_IS_FREE (or FRMR_IS_STALE) transition in order to
+        * 1) avoid a silly bouncing between "clean_list" and "drop_list"
+        *    triggered by function "rds_ib_reg_frmr" as it is releases frmr
+        *    regions whose state is not "FRMR_IS_FREE" right away.
+        * 2) prevents an invalid access error in a race
+        *    from a pending "IB_WR_LOCAL_INV" operation
+        *    with a teardown ("dma_unmap_sg", "put_page")
+        *    and de-registration ("ib_dereg_mr") of the corresponding
+        *    memory region.
+        */
+       wait_event(frmr->fr_inv_done, frmr->fr_state != FRMR_IS_INUSE);
+
 out:
        return ret;
 }
@@ -289,6 +314,11 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
                wake_up(&frmr->fr_inv_done);
        }
 
+       if (frmr->fr_reg) {
+               frmr->fr_reg = false;
+               wake_up(&frmr->fr_reg_done);
+       }
+
        atomic_inc(&ic->i_fastreg_wrs);
 }
 
@@ -297,14 +327,18 @@ void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed,
 {
        struct rds_ib_mr *ibmr, *next;
        struct rds_ib_frmr *frmr;
-       int ret = 0;
+       int ret = 0, ret2;
        unsigned int freed = *nfreed;
 
        /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
        list_for_each_entry(ibmr, list, unmap_list) {
-               if (ibmr->sg_dma_len)
-                       ret |= rds_ib_post_inv(ibmr);
+               if (ibmr->sg_dma_len) {
+                       ret2 = rds_ib_post_inv(ibmr);
+                       if (ret2 && !ret)
+                               ret = ret2;
+               }
        }
+
        if (ret)
                pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, ret);
 
@@ -347,31 +381,8 @@ struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev,
        }
 
        do {
-               if (ibmr) {
-                       /* Memory regions make it onto the "clean_list" via
-                        * "rds_ib_flush_mr_pool", after the memory region has
-                        * been posted for invalidation via "rds_ib_post_inv".
-                        *
-                        * At that point in time, "fr_state" may still be
-                        * in state "FRMR_IS_INUSE", since the only place where
-                        * "fr_state" transitions to "FRMR_IS_FREE" is in
-                        * is in "rds_ib_mr_cqe_handler", which is
-                        * triggered by a tasklet.
-                        *
-                        * So we wait for "fr_inv_done" to trigger
-                        * and only put memory regions onto the drop_list
-                        * that failed (i.e. not marked "FRMR_IS_FREE").
-                        *
-                        * This avoids the problem of memory-regions bouncing
-                        * between "clean_list" and "drop_list" before they
-                        * even have a chance to be properly invalidated.
-                        */
-                       frmr = &ibmr->u.frmr;
-                       wait_event(frmr->fr_inv_done, frmr->fr_state != FRMR_IS_INUSE);
-                       if (frmr->fr_state == FRMR_IS_FREE)
-                               break;
+               if (ibmr)
                        rds_ib_free_frmr(ibmr, true);
-               }
                ibmr = rds_ib_alloc_frmr(rds_ibdev, nents);
                if (IS_ERR(ibmr))
                        return ibmr;
index ab26c20..9045a8c 100644 (file)
@@ -58,6 +58,8 @@ struct rds_ib_frmr {
        enum rds_ib_fr_state    fr_state;
        bool                    fr_inv;
        wait_queue_head_t       fr_inv_done;
+       bool                    fr_reg;
+       wait_queue_head_t       fr_reg_done;
        struct ib_send_wr       fr_wr;
        unsigned int            dma_npages;
        unsigned int            sg_byte_len;