net/mlx5: Add retry mechanism to the command entry index allocation
author Eran Ben Elisha <eranbe@nvidia.com>
Mon, 31 Aug 2020 12:04:35 +0000 (15:04 +0300)
committer Saeed Mahameed <saeedm@nvidia.com>
Fri, 2 Oct 2020 17:59:55 +0000 (10:59 -0700)
Allocating a command entry index for a new command can fail temporarily.
The new command already holds the semaphore, which means a free entry
should become available soon. Add a one-second retry mechanism before
returning an error.

Patch "net/mlx5: Avoid possible free of command entry while timeout comp
handler" increases the likelihood of hitting this temporary failure,
as it delays the entry index release for non-callback commands.

Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters")
Signed-off-by: Eran Ben Elisha <eranbe@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
drivers/net/ethernet/mellanox/mlx5/core/cmd.c

index 37dae95e61d5fe8612a06671bf71db83a36ab512..2b597ac365f84672d2da86bf0c5edf985424177e 100644 (file)
@@ -883,6 +883,25 @@ static bool opcode_allowed(struct mlx5_cmd *cmd, u16 opcode)
        return cmd->allowed_opcode == opcode;
 }
 
+static int cmd_alloc_index_retry(struct mlx5_cmd *cmd)
+{
+       unsigned long alloc_end = jiffies + msecs_to_jiffies(1000);
+       int idx;
+
+retry:
+       idx = cmd_alloc_index(cmd);
+       if (idx < 0 && time_before(jiffies, alloc_end)) {
+               /* A failed allocation is treated as transient: the caller already holds
+                * the command semaphore, so a completing command is expected to release
+                * its entry index soon. Busy-wait and retry until alloc_end, one second
+                * after entry; past that deadline the negative idx is returned as-is.
+                */
+               cpu_relax();
+               goto retry;
+       }
+       return idx;
+}
+
 static void cmd_work_handler(struct work_struct *work)
 {
        struct mlx5_cmd_work_ent *ent = container_of(work, struct mlx5_cmd_work_ent, work);
@@ -900,7 +919,7 @@ static void cmd_work_handler(struct work_struct *work)
        sem = ent->page_queue ? &cmd->pages_sem : &cmd->sem;
        down(sem);
        if (!ent->page_queue) {
-               alloc_ret = cmd_alloc_index(cmd);
+               alloc_ret = cmd_alloc_index_retry(cmd);
                if (alloc_ret < 0) {
                        mlx5_core_err_rl(dev, "failed to allocate command entry\n");
                        if (ent->callback) {