net/mlx5e: Fix possible deadlock on mlx5e_tx_timeout_work

author Moshe Shemesh <moshe@nvidia.com>

Wed, 21 Sep 2022 15:45:11 +0000 (18:45 +0300)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Wed, 20 Dec 2023 16:01:44 +0000 (17:01 +0100)
author Moshe Shemesh <moshe@nvidia.com>
Wed, 21 Sep 2022 15:45:11 +0000 (18:45 +0300)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 20 Dec 2023 16:01:44 +0000 (17:01 +0100)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h

index 86f2690..20a6bc1 100644 (file)
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -818,6 +818,7 @@ enum {
         MLX5E_STATE_DESTROYING,
         MLX5E_STATE_XDP_TX_ENABLED,
         MLX5E_STATE_XDP_ACTIVE,
+       MLX5E_STATE_CHANNELS_ACTIVE,
  };
  
  struct mlx5e_modify_sq_param {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c

index acb4077..c3961c2 100644 (file)
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2668,6 +2668,7 @@ void mlx5e_close_channels(struct mlx5e_channels *chs)
  {
         int i;
  
+       ASSERT_RTNL();
         if (chs->ptp) {
                 mlx5e_ptp_close(chs->ptp);
                 chs->ptp = NULL;
@@ -2945,17 +2946,29 @@ void mlx5e_activate_priv_channels(struct mlx5e_priv *priv)
         if (mlx5e_is_vport_rep(priv))
                 mlx5e_rep_activate_channels(priv);
  
+       set_bit(MLX5E_STATE_CHANNELS_ACTIVE, &priv->state);
+
         mlx5e_wait_channels_min_rx_wqes(&priv->channels);
  
         if (priv->rx_res)
                 mlx5e_rx_res_channels_activate(priv->rx_res, &priv->channels);
  }
  
+/* Cancel priv->tx_timeout_work and wait for it to finish, unless the caller
+ * is that work item itself. Expected to be called after
+ * MLX5E_STATE_CHANNELS_ACTIVE has been cleared; warns once if the bit is
+ * still set.
+ */
+static void mlx5e_cancel_tx_timeout_work(struct mlx5e_priv *priv)
+{
+       WARN_ON_ONCE(test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &priv->state));
+       /* Skip the synchronous cancel when running from tx_timeout_work
+        * itself - presumably the work would otherwise end up waiting for
+        * its own completion.
+        */
+       if (current_work() != &priv->tx_timeout_work)
+               cancel_work_sync(&priv->tx_timeout_work);
+}
+
  void mlx5e_deactivate_priv_channels(struct mlx5e_priv *priv)
  {
         if (priv->rx_res)
                 mlx5e_rx_res_channels_deactivate(priv->rx_res);
  
+       clear_bit(MLX5E_STATE_CHANNELS_ACTIVE, &priv->state);
+       mlx5e_cancel_tx_timeout_work(priv);
+
         if (mlx5e_is_vport_rep(priv))
                 mlx5e_rep_deactivate_channels(priv);
  
@@ -4734,8 +4747,17 @@ static void mlx5e_tx_timeout_work(struct work_struct *work)
         struct net_device *netdev = priv->netdev;
         int i;
  
-       rtnl_lock();
-       mutex_lock(&priv->state_lock);
+       /* Take rtnl_lock to ensure no change in netdev->real_num_tx_queues
+        * through this flow. However, channel closing flows have to wait for
+        * this work to finish while holding rtnl lock too. So either get the
+        * lock or find that channels are being closed for another reason and
+        * this work is not relevant anymore.
+        */
+       while (!rtnl_trylock()) {
+               if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &priv->state))
+                       return;
+               msleep(20);
+       }
  
         if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
                 goto unlock;
@@ -4754,7 +4776,6 @@ static void mlx5e_tx_timeout_work(struct work_struct *work)
         }
  
  unlock:
-       mutex_unlock(&priv->state_lock);
         rtnl_unlock();
  }
author	Moshe Shemesh <moshe@nvidia.com>
	Wed, 21 Sep 2022 15:45:11 +0000 (18:45 +0300)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Wed, 20 Dec 2023 16:01:44 +0000 (17:01 +0100)
drivers/net/ethernet/mellanox/mlx5/core/en.h		patch \| blob \| history
drivers/net/ethernet/mellanox/mlx5/core/en_main.c		patch \| blob \| history