net/mlx5: Fix sync reset event handler error flow
authorMoshe Shemesh <moshe@nvidia.com>
Sat, 29 Oct 2022 06:03:48 +0000 (09:03 +0300)
committerSaeed Mahameed <saeedm@nvidia.com>
Tue, 22 Nov 2022 02:14:34 +0000 (18:14 -0800)
When sync reset now event handling fails on mlx5_pci_link_toggle() then
no reset was done. However, since mlx5_cmd_fast_teardown_hca() was
already done, the firmware function is closed and the driver is left
without firmware functionality.

Fix it by setting device error state and reopen the firmware resources.
Reopening is done by the thread that was called for devlink reload
fw_activate as it already holds the devlink lock.

Fixes: 5ec697446f46 ("net/mlx5: Add support for devlink reload action fw activate")
Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Aya Levin <ayal@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c

index 9d908a0..1e46f9a 100644 (file)
@@ -9,7 +9,8 @@ enum {
        MLX5_FW_RESET_FLAGS_RESET_REQUESTED,
        MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST,
        MLX5_FW_RESET_FLAGS_PENDING_COMP,
-       MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS
+       MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS,
+       MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED
 };
 
 struct mlx5_fw_reset {
@@ -406,7 +407,7 @@ static void mlx5_sync_reset_now_event(struct work_struct *work)
        err = mlx5_pci_link_toggle(dev);
        if (err) {
                mlx5_core_warn(dev, "mlx5_pci_link_toggle failed, no reset done, err %d\n", err);
-               goto done;
+               set_bit(MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED, &fw_reset->reset_flags);
        }
 
        mlx5_enter_error_state(dev, true);
@@ -482,6 +483,10 @@ int mlx5_fw_reset_wait_reset_done(struct mlx5_core_dev *dev)
                goto out;
        }
        err = fw_reset->ret;
+       if (test_and_clear_bit(MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED, &fw_reset->reset_flags)) {
+               mlx5_unload_one_devl_locked(dev);
+               mlx5_load_one_devl_locked(dev, false);
+       }
 out:
        clear_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags);
        return err;