net/mlx5: Check with FW that sync reset completed successfully
author Moshe Shemesh <moshe@nvidia.com>
Wed, 31 May 2023 10:50:21 +0000 (13:50 +0300)
committer Saeed Mahameed <saeedm@nvidia.com>
Mon, 14 Aug 2023 21:40:21 +0000 (14:40 -0700)
Even if the PF driver saw no error in its part of the sync reset flow,
the firmware sees a wider picture, as it syncs all the PFs in the flow.
So, at the end of the sync reset flow, add a check with the firmware, by
reading the MFRL register and the initialization segment, that the flow
had no issue from the firmware's point of view either.

Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
drivers/net/ethernet/mellanox/mlx5/core/devlink.c
drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h
include/linux/mlx5/mlx5_ifc.h

index 3d82ec8..af8460b 100644 (file)
@@ -212,6 +212,9 @@ static int mlx5_devlink_reload_up(struct devlink *devlink, enum devlink_reload_a
                /* On fw_activate action, also driver is reloaded and reinit performed */
                *actions_performed |= BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT);
                ret = mlx5_load_one_devl_locked(dev, true);
+               if (ret)
+                       return ret;
+               ret = mlx5_fw_reset_verify_fw_complete(dev, extack);
                break;
        default:
                /* Unsupported action should not get to this function */
index 4804990..e87766f 100644 (file)
@@ -127,17 +127,23 @@ static int mlx5_fw_reset_get_reset_state_err(struct mlx5_core_dev *dev,
        if (mlx5_reg_mfrl_query(dev, NULL, NULL, &reset_state))
                goto out;
 
+       if (!reset_state)
+               return 0;
+
        switch (reset_state) {
        case MLX5_MFRL_REG_RESET_STATE_IN_NEGOTIATION:
        case MLX5_MFRL_REG_RESET_STATE_RESET_IN_PROGRESS:
-               NL_SET_ERR_MSG_MOD(extack, "Sync reset was already triggered");
+               NL_SET_ERR_MSG_MOD(extack, "Sync reset still in progress");
                return -EBUSY;
-       case MLX5_MFRL_REG_RESET_STATE_TIMEOUT:
-               NL_SET_ERR_MSG_MOD(extack, "Sync reset got timeout");
+       case MLX5_MFRL_REG_RESET_STATE_NEG_TIMEOUT:
+               NL_SET_ERR_MSG_MOD(extack, "Sync reset negotiation timeout");
                return -ETIMEDOUT;
        case MLX5_MFRL_REG_RESET_STATE_NACK:
                NL_SET_ERR_MSG_MOD(extack, "One of the hosts disabled reset");
                return -EPERM;
+       case MLX5_MFRL_REG_RESET_STATE_UNLOAD_TIMEOUT:
+               NL_SET_ERR_MSG_MOD(extack, "Sync reset unload timeout");
+               return -ETIMEDOUT;
        }
 
 out:
@@ -151,7 +157,7 @@ int mlx5_fw_reset_set_reset_sync(struct mlx5_core_dev *dev, u8 reset_type_sel,
        struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
        u32 out[MLX5_ST_SZ_DW(mfrl_reg)] = {};
        u32 in[MLX5_ST_SZ_DW(mfrl_reg)] = {};
-       int err;
+       int err, rst_res;
 
        set_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags);
 
@@ -164,13 +170,34 @@ int mlx5_fw_reset_set_reset_sync(struct mlx5_core_dev *dev, u8 reset_type_sel,
                return 0;
 
        clear_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags);
-       if (err == -EREMOTEIO && MLX5_CAP_MCAM_FEATURE(dev, reset_state))
-               return mlx5_fw_reset_get_reset_state_err(dev, extack);
+       if (err == -EREMOTEIO && MLX5_CAP_MCAM_FEATURE(dev, reset_state)) {
+               rst_res = mlx5_fw_reset_get_reset_state_err(dev, extack);
+               return rst_res ? rst_res : err;
+       }
 
        NL_SET_ERR_MSG_MOD(extack, "Sync reset command failed");
        return mlx5_cmd_check(dev, err, in, out);
 }
 
+/**
+ * mlx5_fw_reset_verify_fw_complete - check that firmware considers the sync
+ *                                    reset complete
+ * @dev: mlx5 core device the reset was performed on
+ * @extack: netlink extended ack; receives an error message on failure
+ *
+ * Two checks are made, in order: the reset state reported through
+ * mlx5_fw_reset_get_reset_state_err(), and then the state returned by
+ * mlx5_get_fw_rst_state() (per the commit description, the initialization
+ * segment). Any non-zero state from the second check is treated as failure.
+ *
+ * Return: 0 when both checks report no outstanding or failed reset, the
+ * error from mlx5_fw_reset_get_reset_state_err() when the first check
+ * fails, or -EIO when the second check reports a non-zero state.
+ */
+int mlx5_fw_reset_verify_fw_complete(struct mlx5_core_dev *dev,
+                                    struct netlink_ext_ack *extack)
+{
+       u8 rst_state;
+       int err;
+
+       err = mlx5_fw_reset_get_reset_state_err(dev, extack);
+       if (err)
+               return err;
+
+       rst_state = mlx5_get_fw_rst_state(dev);
+       if (!rst_state)
+               return 0;
+
+       mlx5_core_err(dev, "Sync reset did not complete, state=%d\n", rst_state);
+       NL_SET_ERR_MSG_MOD(extack, "Sync reset did not complete successfully");
+       /* rst_state is a raw firmware value, not an errno: it is logged above
+        * for diagnosis, but callers propagate this function's result as an
+        * error code, so report a proper negative errno instead.
+        */
+       return -EIO;
+}
+
+
 int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev)
 {
        return mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL0, 0, 0, false);
index c574655..ea527d0 100644 (file)
@@ -12,6 +12,8 @@ int mlx5_fw_reset_set_reset_sync(struct mlx5_core_dev *dev, u8 reset_type_sel,
 int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev);
 
 int mlx5_fw_reset_wait_reset_done(struct mlx5_core_dev *dev);
+int mlx5_fw_reset_verify_fw_complete(struct mlx5_core_dev *dev,
+                                    struct netlink_ext_ack *extack);
 void mlx5_fw_reset_events_start(struct mlx5_core_dev *dev);
 void mlx5_fw_reset_events_stop(struct mlx5_core_dev *dev);
 void mlx5_drain_fw_reset(struct mlx5_core_dev *dev);
index 87fd6f9..9aed7e9 100644 (file)
@@ -10858,8 +10858,9 @@ enum {
        MLX5_MFRL_REG_RESET_STATE_IDLE = 0,
        MLX5_MFRL_REG_RESET_STATE_IN_NEGOTIATION = 1,
        MLX5_MFRL_REG_RESET_STATE_RESET_IN_PROGRESS = 2,
-       MLX5_MFRL_REG_RESET_STATE_TIMEOUT = 3,
+       MLX5_MFRL_REG_RESET_STATE_NEG_TIMEOUT = 3,
        MLX5_MFRL_REG_RESET_STATE_NACK = 4,
+       MLX5_MFRL_REG_RESET_STATE_UNLOAD_TIMEOUT = 5,
 };
 
 enum {