From b9584517832858a0f78d6851d09b697a829514cd Mon Sep 17 00:00:00 2001
From: Michael Guralnik
Date: Thu, 26 Jan 2023 00:28:04 +0200
Subject: [PATCH] RDMA/mlx5: Change the cache structure to an RB-tree

Currently, the cache structure is a static linear array. Therefore, its
size is limited to the number of entries in it and is not expandable.
The entries are dedicated to mkeys of size 2^x with no access_flags, so
mkeys with different properties are not cacheable.

In this patch, we change the cache structure to an RB-tree. This will
allow extending the cache to support more entries with different mkey
properties.

Link: https://lore.kernel.org/r/20230125222807.6921-4-michaelgur@nvidia.com
Signed-off-by: Michael Guralnik
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  11 ++-
 drivers/infiniband/hw/mlx5/mr.c      | 160 ++++++++++++++++++++++++++---------
 drivers/infiniband/hw/mlx5/odp.c     |   8 +-
 3 files changed, 132 insertions(+), 47 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 8d985f7..eec16db 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -741,6 +741,8 @@ struct mlx5_cache_ent {
 	u32 access_mode;
 	unsigned int ndescs;
 
+	struct rb_node node;
+
 	u8 disabled:1;
 	u8 fill_to_high_water:1;
 
@@ -770,8 +772,9 @@ struct mlx5r_async_create_mkey {
 
 struct mlx5_mkey_cache {
 	struct workqueue_struct *wq;
-	struct mlx5_cache_ent ent[MAX_MKEY_CACHE_ENTRIES];
-	struct dentry *root;
+	struct rb_root rb_root;
+	struct mutex rb_lock;
+	struct dentry *fs_root;
 	unsigned long last_add;
 };
 
@@ -1316,11 +1319,15 @@ void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
 int mlx5_ib_get_cqe_size(struct ib_cq *ibcq);
 int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev);
 int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev);
+struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev,
+					      int order);
 
 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
 				       struct mlx5_cache_ent *ent,
 				       int access_flags);
+struct mlx5_ib_mr *mlx5_mr_cache_alloc_order(struct mlx5_ib_dev *dev, u32 order,
+					     int access_flags);
 
 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
 			    struct ib_mr_status *mr_status);
 struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd,
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 356c99d..5cc618d 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -515,18 +515,22 @@ static const struct file_operations limit_fops = {
 
 static bool someone_adding(struct mlx5_mkey_cache *cache)
 {
-	unsigned int i;
-
-	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
-		struct mlx5_cache_ent *ent = &cache->ent[i];
-		bool ret;
+	struct mlx5_cache_ent *ent;
+	struct rb_node *node;
+	bool ret;
 
+	mutex_lock(&cache->rb_lock);
+	for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) {
+		ent = rb_entry(node, struct mlx5_cache_ent, node);
 		xa_lock_irq(&ent->mkeys);
 		ret = ent->stored < ent->limit;
 		xa_unlock_irq(&ent->mkeys);
-		if (ret)
+		if (ret) {
+			mutex_unlock(&cache->rb_lock);
 			return true;
+		}
 	}
+	mutex_unlock(&cache->rb_lock);
 	return false;
 }
 
@@ -637,6 +641,59 @@ static void delayed_cache_work_func(struct work_struct *work)
 	__cache_work_func(ent);
 }
 
+static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
+				 struct mlx5_cache_ent *ent)
+{
+	struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL;
+	struct mlx5_cache_ent *cur;
+
+	mutex_lock(&cache->rb_lock);
+	/* Figure out where to put new node */
+	while (*new) {
+		cur = rb_entry(*new, struct mlx5_cache_ent, node);
+		parent = *new;
+		if (ent->order < cur->order)
+			new = &((*new)->rb_left);
+		if (ent->order > cur->order)
+			new = &((*new)->rb_right);
+		if (ent->order == cur->order) {
+			mutex_unlock(&cache->rb_lock);
+			return -EEXIST;
+		}
+	}
+
+	/* Add new node and rebalance tree. */
+	rb_link_node(&ent->node, parent, new);
+	rb_insert_color(&ent->node, &cache->rb_root);
+
+	mutex_unlock(&cache->rb_lock);
+	return 0;
+}
+
+static struct mlx5_cache_ent *mkey_cache_ent_from_order(struct mlx5_ib_dev *dev,
+							unsigned int order)
+{
+	struct rb_node *node = dev->cache.rb_root.rb_node;
+	struct mlx5_cache_ent *cur, *smallest = NULL;
+
+	/*
+	 * Find the smallest ent with order >= requested_order.
+	 */
+	while (node) {
+		cur = rb_entry(node, struct mlx5_cache_ent, node);
+		if (cur->order > order) {
+			smallest = cur;
+			node = node->rb_left;
+		}
+		if (cur->order < order)
+			node = node->rb_right;
+		if (cur->order == order)
+			return cur;
+	}
+
+	return smallest;
+}
+
 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
 				       struct mlx5_cache_ent *ent,
 				       int access_flags)
@@ -677,10 +734,16 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
 	return mr;
 }
 
-static void clean_keys(struct mlx5_ib_dev *dev, int c)
+struct mlx5_ib_mr *mlx5_mr_cache_alloc_order(struct mlx5_ib_dev *dev,
+					     u32 order, int access_flags)
+{
+	struct mlx5_cache_ent *ent = mkey_cache_ent_from_order(dev, order);
+
+	return mlx5_mr_cache_alloc(dev, ent, access_flags);
+}
+
+static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
 {
-	struct mlx5_mkey_cache *cache = &dev->cache;
-	struct mlx5_cache_ent *ent = &cache->ent[c];
 	u32 mkey;
 
 	cancel_delayed_work(&ent->dwork);
@@ -699,8 +762,8 @@ static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
 	if (!mlx5_debugfs_root || dev->is_rep)
 		return;
 
-	debugfs_remove_recursive(dev->cache.root);
-	dev->cache.root = NULL;
+	debugfs_remove_recursive(dev->cache.fs_root);
+	dev->cache.fs_root = NULL;
 }
 
 static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
@@ -713,12 +776,13 @@ static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
 	if (!mlx5_debugfs_root || dev->is_rep)
 		return;
 
-	cache->root = debugfs_create_dir("mr_cache", mlx5_debugfs_get_dev_root(dev->mdev));
+	dir = mlx5_debugfs_get_dev_root(dev->mdev);
+	cache->fs_root = debugfs_create_dir("mr_cache", dir);
 
 	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
-		ent = &cache->ent[i];
+		ent = mkey_cache_ent_from_order(dev, i);
 		sprintf(ent->name, "%d", ent->order);
-		dir = debugfs_create_dir(ent->name, cache->root);
+		dir = debugfs_create_dir(ent->name, cache->fs_root);
 		debugfs_create_file("size", 0600, dir, ent, &size_fops);
 		debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
 		debugfs_create_ulong("cur", 0400, dir, &ent->stored);
@@ -733,6 +797,30 @@ static void delay_time_func(struct timer_list *t)
 	WRITE_ONCE(dev->fill_delay, 0);
 }
 
+struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev,
+					      int order)
+{
+	struct mlx5_cache_ent *ent;
+	int ret;
+
+	ent = kzalloc(sizeof(*ent), GFP_KERNEL);
+	if (!ent)
+		return ERR_PTR(-ENOMEM);
+
+	xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ);
+	ent->order = order;
+	ent->dev = dev;
+
+	INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
+
+	ret = mlx5_cache_ent_insert(&dev->cache, ent);
+	if (ret) {
+		kfree(ent);
+		return ERR_PTR(ret);
+	}
+	return ent;
+}
+
 int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
 {
 	struct mlx5_mkey_cache *cache = &dev->cache;
@@ -740,6 +828,8 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
 	int i;
 
 	mutex_init(&dev->slow_path_mutex);
+	mutex_init(&dev->cache.rb_lock);
+	dev->cache.rb_root = RB_ROOT;
 	cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
 	if (!cache->wq) {
 		mlx5_ib_warn(dev, "failed to create work queue\n");
@@ -749,13 +839,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
 	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
 	timer_setup(&dev->delay_timer, delay_time_func, 0);
 	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
-		ent = &cache->ent[i];
-		xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ);
-		ent->order = i + 2;
-		ent->dev = dev;
-		ent->limit = 0;
-
-		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
+		ent = mlx5r_cache_create_ent(dev, i);
 
 		if (i > MKEY_CACHE_LAST_STD_ENTRY) {
 			mlx5_odp_init_mkey_cache_entry(ent);
@@ -785,14 +869,16 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
 
 int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
 {
-	unsigned int i;
+	struct rb_root *root = &dev->cache.rb_root;
+	struct mlx5_cache_ent *ent;
+	struct rb_node *node;
 
 	if (!dev->cache.wq)
 		return 0;
 
-	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
-		struct mlx5_cache_ent *ent = &dev->cache.ent[i];
-
+	mutex_lock(&dev->cache.rb_lock);
+	for (node = rb_first(root); node; node = rb_next(node)) {
+		ent = rb_entry(node, struct mlx5_cache_ent, node);
 		xa_lock_irq(&ent->mkeys);
 		ent->disabled = true;
 		xa_unlock_irq(&ent->mkeys);
@@ -802,8 +888,15 @@ int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
 	mlx5_mkey_cache_debugfs_cleanup(dev);
 	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
 
-	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++)
-		clean_keys(dev, i);
+	node = rb_first(root);
+	while (node) {
+		ent = rb_entry(node, struct mlx5_cache_ent, node);
+		node = rb_next(node);
+		clean_keys(dev, ent);
+		rb_erase(&ent->node, root);
+		kfree(ent);
+	}
+	mutex_unlock(&dev->cache.rb_lock);
 
 	destroy_workqueue(dev->cache.wq);
 	del_timer_sync(&dev->delay_timer);
@@ -876,19 +969,6 @@ static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
 	return MLX5_MAX_UMR_SHIFT;
 }
 
-static struct mlx5_cache_ent *mkey_cache_ent_from_order(struct mlx5_ib_dev *dev,
-							unsigned int order)
-{
-	struct mlx5_mkey_cache *cache = &dev->cache;
-
-	if (order < cache->ent[0].order)
-		return &cache->ent[0];
-	order = order - cache->ent[0].order;
-	if (order > MKEY_CACHE_LAST_STD_ENTRY)
-		return NULL;
-	return &cache->ent[order];
-}
-
 static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
 			  u64 length, int access_flags, u64 iova)
 {
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 72044f8..71c3c61 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -419,8 +419,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 		return ERR_CAST(odp);
 
 	BUILD_BUG_ON(order > MKEY_CACHE_LAST_STD_ENTRY);
-	mr = mlx5_mr_cache_alloc(dev, &dev->cache.ent[order],
-				 imr->access_flags);
+	mr = mlx5_mr_cache_alloc_order(dev, order, imr->access_flags);
 	if (IS_ERR(mr)) {
 		ib_umem_odp_release(odp);
 		return mr;
@@ -494,9 +493,8 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 	if (IS_ERR(umem_odp))
 		return ERR_CAST(umem_odp);
 
-	imr = mlx5_mr_cache_alloc(dev,
-				  &dev->cache.ent[MLX5_IMR_KSM_CACHE_ENTRY],
-				  access_flags);
+	imr = mlx5_mr_cache_alloc_order(dev, MLX5_IMR_KSM_CACHE_ENTRY,
+					access_flags);
 	if (IS_ERR(imr)) {
 		ib_umem_odp_release(umem_odp);
 		return imr;
-- 
2.7.4
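
Note (not part of the patch): the lookup the new tree relies on is mkey_cache_ent_from_order() returning
the entry with the smallest order that is >= the requested order, so a request with no exact-size entry is
served by the next larger one; this is what the commit message means by allowing the cache to be extended
with more entry kinds. Below is a minimal standalone userspace C sketch of that ceiling lookup. It is an
illustration only: the names (demo_ent, demo_insert, demo_ceiling) are hypothetical and a plain unbalanced
BST stands in for the kernel rbtree.

/* Standalone illustration of the "smallest order >= requested" lookup. */
#include <stdio.h>
#include <stdlib.h>

struct demo_ent {
	unsigned int order;
	struct demo_ent *left, *right;
};

/* Insert a node keyed by order; duplicate orders are rejected (like -EEXIST). */
static int demo_insert(struct demo_ent **root, struct demo_ent *ent)
{
	struct demo_ent **new = root;

	while (*new) {
		if (ent->order < (*new)->order)
			new = &(*new)->left;
		else if (ent->order > (*new)->order)
			new = &(*new)->right;
		else
			return -1;
	}
	*new = ent;
	return 0;
}

/* Return the entry with the smallest order >= the requested order, or NULL. */
static struct demo_ent *demo_ceiling(struct demo_ent *node, unsigned int order)
{
	struct demo_ent *smallest = NULL;

	while (node) {
		if (node->order == order)
			return node;
		if (node->order > order) {
			smallest = node;	/* candidate, try to find a smaller fit */
			node = node->left;
		} else {
			node = node->right;
		}
	}
	return smallest;
}

int main(void)
{
	static const unsigned int orders[] = { 2, 4, 7, 10 };
	struct demo_ent *root = NULL;
	unsigned int i;

	for (i = 0; i < sizeof(orders) / sizeof(orders[0]); i++) {
		struct demo_ent *ent = calloc(1, sizeof(*ent));

		if (!ent)
			return 1;
		ent->order = orders[i];
		demo_insert(&root, ent);
	}

	/* A request of order 5 is served by the order-7 entry, and so on. */
	for (i = 1; i <= 11; i++) {
		struct demo_ent *ent = demo_ceiling(root, i);

		if (ent)
			printf("request order %2u -> entry of order %u\n", i, ent->order);
		else
			printf("request order %2u -> no entry large enough\n", i);
	}
	return 0;
}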