net/mlx5: Fix handling of entry refcount when command is not issued to FW
author Moshe Shemesh <moshe@nvidia.com>
Thu, 17 Nov 2022 07:07:20 +0000 (09:07 +0200)
committer Saeed Mahameed <saeedm@nvidia.com>
Tue, 22 Nov 2022 02:14:34 +0000 (18:14 -0800)
In case the command interface is down, or the command is not allowed, the
driver did not increment the entry refcount, but might have decremented it
as part of forced completion handling.

Fix that by always incrementing and decrementing the refcount, to make it
symmetric for all flows.

Fixes: 50b2412b7e78 ("net/mlx5: Avoid possible free of command entry while timeout comp handler")
Signed-off-by: Eran Ben Elisha <eranbe@nvidia.com>
Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
Reported-by: Jack Wang <jinpu.wang@ionos.com>
Tested-by: Jack Wang <jinpu.wang@ionos.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
drivers/net/ethernet/mellanox/mlx5/core/cmd.c

index df3e284..74bd05e 100644 (file)
@@ -1005,6 +1005,7 @@ static void cmd_work_handler(struct work_struct *work)
                cmd_ent_get(ent);
        set_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, &ent->state);
 
+       cmd_ent_get(ent); /* for the _real_ FW event on completion */
        /* Skip sending command to fw if internal error */
        if (mlx5_cmd_is_down(dev) || !opcode_allowed(&dev->cmd, ent->op)) {
                ent->ret = -ENXIO;
@@ -1012,7 +1013,6 @@ static void cmd_work_handler(struct work_struct *work)
                return;
        }
 
-       cmd_ent_get(ent); /* for the _real_ FW event on completion */
        /* ring doorbell after the descriptor is valid */
        mlx5_core_dbg(dev, "writing 0x%x to command doorbell\n", 1 << ent->idx);
        wmb();
@@ -1661,8 +1661,8 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force
                                cmd_ent_put(ent); /* timeout work was canceled */
 
                        if (!forced || /* Real FW completion */
-                           pci_channel_offline(dev->pdev) || /* FW is inaccessible */
-                           dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
+                            mlx5_cmd_is_down(dev) || /* No real FW completion is expected */
+                            !opcode_allowed(cmd, ent->op))
                                cmd_ent_put(ent);
 
                        ent->ts2 = ktime_get_ns();