net/mlx5e: xsk: Fix crash on regular rq reactivation
author Dragos Tatulea <dtatulea@nvidia.com>
Mon, 24 Apr 2023 15:19:00 +0000 (18:19 +0300)
committer Saeed Mahameed <saeedm@nvidia.com>
Wed, 26 Jul 2023 21:31:04 +0000 (14:31 -0700)
When the regular rq is reactivated after the XSK socket is closed,
it can read stale cqes, which eventually corrupts the rq. This leads
to no more traffic being received on the regular rq and to a crash on
the next close or deactivation of the rq.

Kal Cutter Conley reported this issue as a crash on the release
path when the xdpsock sample program is stopped (killed) and restarted
in sequence while traffic is running.

This patch flushes all cqes during the rq flush. The cqe flushing is
done while the rq is in the reset state. The mlx5e_rq_to_ready code is
moved into the flush function to allow for this.
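
In short, the flush now performs the following sequence. This is an
illustrative sketch using the helpers from the diff below, not the
verbatim committed code:

	int mlx5e_flush_rq(struct mlx5e_rq *rq, int curr_state)
	{
		int err;

		/* Park the rq in RST so the HW stops generating completions. */
		err = mlx5e_modify_rq_state(rq, curr_state, MLX5_RQC_STATE_RST);
		if (err)
			return err;

		mlx5e_free_rx_descs(rq);	/* reclaim posted rx descriptors */
		mlx5e_flush_rq_cq(rq);		/* pop stale cqes, update cq doorbell */

		/* Only now is it safe to move the rq back to RDY. */
		return mlx5e_modify_rq_state(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY);
	}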

Fixes: 082a9edf12fe ("net/mlx5e: xsk: Flush RQ on XSK activation to save memory")
Reported-by: Kal Cutter Conley <kal.conley@dectris.com>
Closes: https://lore.kernel.org/xdp-newbies/CAHApi-nUAs4TeFWUDV915CZJo07XVg2Vp63-no7UDfj6wur9nQ@mail.gmail.com
Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
drivers/net/ethernet/mellanox/mlx5/core/en_main.c

index defb1efccb78fa5ee067eb7ed2ce8cdd2a603040..1c820119e438f0cd119f891080e16628313971cb 100644
@@ -1036,7 +1036,23 @@ static int mlx5e_modify_rq_state(struct mlx5e_rq *rq, int curr_state, int next_s
        return err;
 }
 
-static int mlx5e_rq_to_ready(struct mlx5e_rq *rq, int curr_state)
+static void mlx5e_flush_rq_cq(struct mlx5e_rq *rq)
+{
+       struct mlx5_cqwq *cqwq = &rq->cq.wq;
+       struct mlx5_cqe64 *cqe;
+
+       if (test_bit(MLX5E_RQ_STATE_MINI_CQE_ENHANCED, &rq->state)) {
+               while ((cqe = mlx5_cqwq_get_cqe_enahnced_comp(cqwq)))
+                       mlx5_cqwq_pop(cqwq);
+       } else {
+               while ((cqe = mlx5_cqwq_get_cqe(cqwq)))
+                       mlx5_cqwq_pop(cqwq);
+       }
+
+       mlx5_cqwq_update_db_record(cqwq);
+}
+
+int mlx5e_flush_rq(struct mlx5e_rq *rq, int curr_state)
 {
        struct net_device *dev = rq->netdev;
        int err;
@@ -1046,6 +1062,10 @@ static int mlx5e_rq_to_ready(struct mlx5e_rq *rq, int curr_state)
                netdev_err(dev, "Failed to move rq 0x%x to reset\n", rq->rqn);
                return err;
        }
+
+       mlx5e_free_rx_descs(rq);
+       mlx5e_flush_rq_cq(rq);
+
        err = mlx5e_modify_rq_state(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY);
        if (err) {
                netdev_err(dev, "Failed to move rq 0x%x to ready\n", rq->rqn);
@@ -1055,13 +1075,6 @@ static int mlx5e_rq_to_ready(struct mlx5e_rq *rq, int curr_state)
        return 0;
 }
 
-int mlx5e_flush_rq(struct mlx5e_rq *rq, int curr_state)
-{
-       mlx5e_free_rx_descs(rq);
-
-       return mlx5e_rq_to_ready(rq, curr_state);
-}
-
 static int mlx5e_modify_rq_vsd(struct mlx5e_rq *rq, bool vsd)
 {
        struct mlx5_core_dev *mdev = rq->mdev;