net/mlx5: Increase FW pre-init timeout for health recovery
author Gavin Li <gavinl@nvidia.com>
Sun, 27 Mar 2022 14:36:44 +0000 (17:36 +0300)
committer Saeed Mahameed <saeedm@nvidia.com>
Tue, 10 May 2022 05:54:00 +0000 (22:54 -0700)
Currently, health recovery reloads the driver to recover it from fatal
errors. During the driver's load process, it waits for FW to set the
pre-init bit for up to 120 seconds; beyond this threshold it aborts
the load process. In some cases, such as a FW upgrade on the DPU, this
timeout period is insufficient, and the user has no way to recover the
host device.

To solve this issue, introduce a new FW pre-init timeout for health
recovery, which is set to 2 hours.

Devlink reload and probe keep the original timeout because they are
user-triggered flows, and therefore should not have a significantly
long timeout, during which the user command would hang.

Signed-off-by: Gavin Li <gavinl@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
drivers/net/ethernet/mellanox/mlx5/core/devlink.c
drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c
drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h
drivers/net/ethernet/mellanox/mlx5/core/main.c
drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h

index e8789e6..f85166e 100644 (file)
@@ -178,13 +178,13 @@ static int mlx5_devlink_reload_up(struct devlink *devlink, enum devlink_reload_a
        *actions_performed = BIT(action);
        switch (action) {
        case DEVLINK_RELOAD_ACTION_DRIVER_REINIT:
-               return mlx5_load_one(dev);
+               return mlx5_load_one(dev, false);
        case DEVLINK_RELOAD_ACTION_FW_ACTIVATE:
                if (limit == DEVLINK_RELOAD_LIMIT_NO_RESET)
                        break;
                /* On fw_activate action, also driver is reloaded and reinit performed */
                *actions_performed |= BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT);
-               return mlx5_load_one(dev);
+               return mlx5_load_one(dev, false);
        default:
                /* Unsupported action should not get to this function */
                WARN_ON(1);
index ca1aba8..84df0d5 100644 (file)
@@ -148,7 +148,7 @@ static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev)
        if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) {
                complete(&fw_reset->done);
        } else {
-               mlx5_load_one(dev);
+               mlx5_load_one(dev, false);
                devlink_remote_reload_actions_performed(priv_to_devlink(dev), 0,
                                                        BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) |
                                                        BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE));
index c1df0d3..d758848 100644 (file)
@@ -10,6 +10,7 @@ struct mlx5_timeouts {
 
 static const u32 tout_def_sw_val[MAX_TIMEOUT_TYPES] = {
        [MLX5_TO_FW_PRE_INIT_TIMEOUT_MS] = 120000,
+       [MLX5_TO_FW_PRE_INIT_ON_RECOVERY_TIMEOUT_MS] = 7200000,
        [MLX5_TO_FW_PRE_INIT_WARN_MESSAGE_INTERVAL_MS] = 20000,
        [MLX5_TO_FW_PRE_INIT_WAIT_MS] = 2,
        [MLX5_TO_FW_INIT_MS] = 2000,
index 1c42ead..257c03e 100644 (file)
@@ -7,6 +7,7 @@
 enum mlx5_timeouts_types {
        /* pre init timeouts (not read from FW) */
        MLX5_TO_FW_PRE_INIT_TIMEOUT_MS,
+       MLX5_TO_FW_PRE_INIT_ON_RECOVERY_TIMEOUT_MS,
        MLX5_TO_FW_PRE_INIT_WARN_MESSAGE_INTERVAL_MS,
        MLX5_TO_FW_PRE_INIT_WAIT_MS,
 
index f28a352..84f75aa 100644 (file)
@@ -1003,7 +1003,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
        mlx5_devcom_unregister_device(dev->priv.devcom);
 }
 
-static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot)
+static int mlx5_function_setup(struct mlx5_core_dev *dev, u64 timeout)
 {
        int err;
 
@@ -1018,11 +1018,11 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot)
 
        /* wait for firmware to accept initialization segments configurations
         */
-       err = wait_fw_init(dev, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT),
+       err = wait_fw_init(dev, timeout,
                           mlx5_tout_ms(dev, FW_PRE_INIT_WARN_MESSAGE_INTERVAL));
        if (err) {
                mlx5_core_err(dev, "Firmware over %llu MS in pre-initializing state, aborting\n",
-                             mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT));
+                             timeout);
                return err;
        }
 
@@ -1272,7 +1272,7 @@ int mlx5_init_one(struct mlx5_core_dev *dev)
        mutex_lock(&dev->intf_state_mutex);
        dev->state = MLX5_DEVICE_STATE_UP;
 
-       err = mlx5_function_setup(dev, true);
+       err = mlx5_function_setup(dev, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT));
        if (err)
                goto err_function;
 
@@ -1336,9 +1336,10 @@ out:
        mutex_unlock(&dev->intf_state_mutex);
 }
 
-int mlx5_load_one(struct mlx5_core_dev *dev)
+int mlx5_load_one(struct mlx5_core_dev *dev, bool recovery)
 {
        int err = 0;
+       u64 timeout;
 
        mutex_lock(&dev->intf_state_mutex);
        if (test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
@@ -1348,7 +1349,11 @@ int mlx5_load_one(struct mlx5_core_dev *dev)
        /* remove any previous indication of internal error */
        dev->state = MLX5_DEVICE_STATE_UP;
 
-       err = mlx5_function_setup(dev, false);
+       if (recovery)
+               timeout = mlx5_tout_ms(dev, FW_PRE_INIT_ON_RECOVERY_TIMEOUT);
+       else
+               timeout = mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT);
+       err = mlx5_function_setup(dev, timeout);
        if (err)
                goto err_function;
 
@@ -1719,7 +1724,7 @@ static void mlx5_pci_resume(struct pci_dev *pdev)
 
        mlx5_pci_trace(dev, "Enter, loading driver..\n");
 
-       err = mlx5_load_one(dev);
+       err = mlx5_load_one(dev, false);
 
        mlx5_pci_trace(dev, "Done, err = %d, device %s\n", err,
                       !err ? "recovered" : "Failed");
@@ -1807,7 +1812,7 @@ static int mlx5_resume(struct pci_dev *pdev)
 {
        struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
 
-       return mlx5_load_one(dev);
+       return mlx5_load_one(dev, false);
 }
 
 static const struct pci_device_id mlx5_core_pci_table[] = {
@@ -1852,7 +1857,7 @@ int mlx5_recover_device(struct mlx5_core_dev *dev)
                        return -EIO;
        }
 
-       return mlx5_load_one(dev);
+       return mlx5_load_one(dev, true);
 }
 
 static struct pci_driver mlx5_core_driver = {
index a9b2d6e..9026be1 100644 (file)
@@ -290,7 +290,7 @@ void mlx5_mdev_uninit(struct mlx5_core_dev *dev);
 int mlx5_init_one(struct mlx5_core_dev *dev);
 void mlx5_uninit_one(struct mlx5_core_dev *dev);
 void mlx5_unload_one(struct mlx5_core_dev *dev);
-int mlx5_load_one(struct mlx5_core_dev *dev);
+int mlx5_load_one(struct mlx5_core_dev *dev, bool recovery);
 
 int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out);