RDMA/umem: Combine contiguous PAGE_SIZE regions in SGEs
author Shiraz Saleem <shiraz.saleem@intel.com>
Tue, 2 Apr 2019 19:52:52 +0000 (14:52 -0500)
committer Jason Gunthorpe <jgg@mellanox.com>
Mon, 8 Apr 2019 16:05:24 +0000 (13:05 -0300)
Combine contiguous regions of PAGE_SIZE pages into a single scatter list
entry while building the scatter table for a umem. This minimizes the
number of entries in the scatter list and reduces the DMA mapping
overhead, particularly with the IOMMU.

Set default max_seg_size in core for IB devices to 2G and do not combine
if we exceed this limit.

Also, purge npages in struct ib_umem, as we now DMA map the umem SGL with
sg_nents and the npages computation is no longer needed. Drivers should now
be using ib_umem_num_pages(), so fix the last stragglers.

Move npages tracking to ib_umem_odp as ODP drivers still need it.

Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Acked-by: Adit Ranadive <aditr@vmware.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
Tested-by: Gal Pressman <galpress@amazon.com>
Tested-by: Selvin Xavier <selvin.xavier@broadcom.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
drivers/infiniband/core/device.c
drivers/infiniband/core/umem.c
drivers/infiniband/core/umem_odp.c
drivers/infiniband/hw/mlx5/odp.c
drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
include/rdma/ib_umem.h
include/rdma/ib_umem_odp.h

index 2dbd047..0f98da1 100644 (file)
@@ -1089,6 +1089,9 @@ static void setup_dma_device(struct ib_device *device)
                WARN_ON_ONCE(!parent);
                device->dma_device = parent;
        }
+       /* Setup default max segment size for all IB devices */
+       dma_set_max_seg_size(device->dma_device, SZ_2G);
+
 }
 
 /*
index 89a7d57..d31f5e3 100644 (file)
 #include <linux/export.h>
 #include <linux/hugetlb.h>
 #include <linux/slab.h>
+#include <linux/pagemap.h>
 #include <rdma/ib_umem_odp.h>
 
 #include "uverbs.h"
 
-
 static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
 {
-       struct scatterlist *sg;
+       struct sg_page_iter sg_iter;
        struct page *page;
-       int i;
 
        if (umem->nmap > 0)
-               ib_dma_unmap_sg(dev, umem->sg_head.sgl,
-                               umem->npages,
+               ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents,
                                DMA_BIDIRECTIONAL);
 
-       for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
-
-               page = sg_page(sg);
+       for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
+               page = sg_page_iter_page(&sg_iter);
                if (!PageDirty(page) && umem->writable && dirty)
                        set_page_dirty_lock(page);
                put_page(page);
@@ -66,6 +63,69 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
        sg_free_table(&umem->sg_head);
 }
 
+/* ib_umem_add_sg_table - Add N contiguous pages to scatter table
+ *
+ * sg: current scatterlist entry
+ * page_list: array of npage struct page pointers
+ * npages: number of pages in page_list
+ * max_seg_sz: maximum segment size in bytes
+ * nents: [out] number of entries in the scatterlist
+ *
+ * Return new end of scatterlist
+ */
+static struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg,
+                                               struct page **page_list,
+                                               unsigned long npages,
+                                               unsigned int max_seg_sz,
+                                               int *nents)
+{
+       unsigned long first_pfn;
+       unsigned long i = 0;
+       bool update_cur_sg = false;
+       bool first = !sg_page(sg);
+
+       /* Check if new page_list is contiguous with end of previous page_list.
+        * sg->length here is a multiple of PAGE_SIZE and sg->offset is 0.
+        */
+       if (!first && (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) ==
+                      page_to_pfn(page_list[0])))
+               update_cur_sg = true;
+
+       while (i != npages) {
+               unsigned long len;
+               struct page *first_page = page_list[i];
+
+               first_pfn = page_to_pfn(first_page);
+
+               /* Compute the number of contiguous pages we have starting
+                * at i
+                */
+               for (len = 0; i != npages &&
+                             first_pfn + len == page_to_pfn(page_list[i]);
+                    len++)
+                       i++;
+
+               /* Squash N contiguous pages from page_list into current sge */
+               if (update_cur_sg &&
+                   ((max_seg_sz - sg->length) >= (len << PAGE_SHIFT))) {
+                       sg_set_page(sg, sg_page(sg),
+                                   sg->length + (len << PAGE_SHIFT), 0);
+                       update_cur_sg = false;
+                       continue;
+               }
+
+               /* Squash N contiguous pages into next sge or first sge */
+               if (!first)
+                       sg = sg_next(sg);
+
+               (*nents)++;
+               sg_set_page(sg, first_page, len << PAGE_SHIFT, 0);
+               first = false;
+       }
+
+       return sg;
+}
+
 /**
  * ib_umem_get - Pin and DMA map userspace memory.
  *
@@ -93,7 +153,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
        int ret;
        int i;
        unsigned long dma_attrs = 0;
-       struct scatterlist *sg, *sg_list_start;
+       struct scatterlist *sg;
        unsigned int gup_flags = FOLL_WRITE;
 
        if (!udata)
@@ -190,7 +250,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
        if (!umem->writable)
                gup_flags |= FOLL_FORCE;
 
-       sg_list_start = umem->sg_head.sgl;
+       sg = umem->sg_head.sgl;
 
        while (npages) {
                down_read(&mm->mmap_sem);
@@ -203,28 +263,29 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
                        goto umem_release;
                }
 
-               umem->npages += ret;
                cur_base += ret * PAGE_SIZE;
                npages   -= ret;
 
+               sg = ib_umem_add_sg_table(sg, page_list, ret,
+                       dma_get_max_seg_size(context->device->dma_device),
+                       &umem->sg_nents);
+
                /* Continue to hold the mmap_sem as vma_list access
                 * needs to be protected.
                 */
-               for_each_sg(sg_list_start, sg, ret, i) {
+               for (i = 0; i < ret && umem->hugetlb; i++) {
                        if (vma_list && !is_vm_hugetlb_page(vma_list[i]))
                                umem->hugetlb = 0;
-
-                       sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
                }
-               up_read(&mm->mmap_sem);
 
-               /* preparing for next loop */
-               sg_list_start = sg;
+               up_read(&mm->mmap_sem);
        }
 
+       sg_mark_end(sg);
+
        umem->nmap = ib_dma_map_sg_attrs(context->device,
                                  umem->sg_head.sgl,
-                                 umem->npages,
+                                 umem->sg_nents,
                                  DMA_BIDIRECTIONAL,
                                  dma_attrs);
 
@@ -320,8 +381,8 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
                return -EINVAL;
        }
 
-       ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->npages, dst, length,
-                                offset + ib_umem_offset(umem));
+       ret = sg_pcopy_to_buffer(umem->sg_head.sgl, ib_umem_num_pages(umem),
+                                dst, length, offset + ib_umem_offset(umem));
 
        if (ret < 0)
                return ret;
index 6f8c36f..9721914 100644 (file)
@@ -526,7 +526,7 @@ static int ib_umem_odp_map_dma_single_page(
                }
                umem_odp->dma_list[page_index] = dma_addr | access_mask;
                umem_odp->page_list[page_index] = page;
-               umem->npages++;
+               umem_odp->npages++;
        } else if (umem_odp->page_list[page_index] == page) {
                umem_odp->dma_list[page_index] |= access_mask;
        } else {
@@ -752,7 +752,7 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
                        }
                        umem_odp->page_list[idx] = NULL;
                        umem_odp->dma_list[idx] = 0;
-                       umem->npages--;
+                       umem_odp->npages--;
                }
        }
        mutex_unlock(&umem_odp->umem_mutex);
index cdb0d63..91507a2 100644 (file)
@@ -288,7 +288,7 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
 
        ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
 
-       if (unlikely(!umem->npages && mr->parent &&
+       if (unlikely(!umem_odp->npages && mr->parent &&
                     !umem_odp->dying)) {
                WRITE_ONCE(umem_odp->dying, 1);
                atomic_inc(&mr->parent->num_leaf_free);
index 9e6c44e..65dc47f 100644 (file)
@@ -119,7 +119,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
        union pvrdma_cmd_resp rsp;
        struct pvrdma_cmd_create_mr *cmd = &req.create_mr;
        struct pvrdma_cmd_create_mr_resp *resp = &rsp.create_mr_resp;
-       int ret;
+       int ret, npages;
 
        if (length == 0 || length > dev->dsr->caps.max_mr_size) {
                dev_warn(&dev->pdev->dev, "invalid mem region length\n");
@@ -133,9 +133,10 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
                return ERR_CAST(umem);
        }
 
-       if (umem->npages < 0 || umem->npages > PVRDMA_PAGE_DIR_MAX_PAGES) {
+       npages = ib_umem_num_pages(umem);
+       if (npages < 0 || npages > PVRDMA_PAGE_DIR_MAX_PAGES) {
                dev_warn(&dev->pdev->dev, "overflow %d pages in mem region\n",
-                        umem->npages);
+                        npages);
                ret = -EINVAL;
                goto err_umem;
        }
@@ -150,7 +151,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
        mr->mmr.size = length;
        mr->umem = umem;
 
-       ret = pvrdma_page_dir_init(dev, &mr->pdir, umem->npages, false);
+       ret = pvrdma_page_dir_init(dev, &mr->pdir, npages, false);
        if (ret) {
                dev_warn(&dev->pdev->dev,
                         "could not allocate page directory\n");
@@ -167,7 +168,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
        cmd->length = length;
        cmd->pd_handle = to_vpd(pd)->pd_handle;
        cmd->access_flags = access_flags;
-       cmd->nchunks = umem->npages;
+       cmd->nchunks = npages;
        cmd->pdir_dma = mr->pdir.dir_dma;
 
        ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_CREATE_MR_RESP);
index 73af05d..b13a2e9 100644 (file)
@@ -53,7 +53,7 @@ struct ib_umem {
        struct work_struct      work;
        struct sg_table sg_head;
        int             nmap;
-       int             npages;
+       unsigned int    sg_nents;
 };
 
 /* Returns the offset of the umem start relative to the first page. */
index dadc96d..eeec4e5 100644 (file)
@@ -69,6 +69,7 @@ struct ib_umem_odp {
 
        int notifiers_seq;
        int notifiers_count;
+       int npages;
 
        /* Tree tracking */
        struct umem_odp_node    interval_tree;